1 месяц назад · fe3056fbb3
--- a/backend/graph-service/src/main/java/com/lingyue/graph/service/GraphNerService.java
+++ b/backend/graph-service/src/main/java/com/lingyue/graph/service/GraphNerService.java
@@ -34,7 +34,6 @@ public class GraphNerService {
 
				     private final TextStorageRepository textStorageRepository;
			
 
				     private final GraphNodeRepository graphNodeRepository;
			
 
				     private final GraphRelationRepository graphRelationRepository;
			
 
				-    private final PositionMappingService positionMappingService;
			
 
				 
			
 
				     /**
			
 
				      * 获取文档的文本内容
			
@@ -99,17 +98,13 @@ public class GraphNerService {
 
				             node.setCreateTime(new Date());
			
 
				             node.setUpdateTime(new Date());
			
 
				             
			
 
				-            // 转换位置信息并补充页码/行号
			
 
				+            // 转换位置信息（直接使用字符偏移）
			
 
				             Object positionObj = entity.get("position");
			
 
				             if (positionObj instanceof Map) {
			
 
				                 @SuppressWarnings("unchecked")
			
 
				                 Map<String, Object> posMap = (Map<String, Object>) positionObj;
			
 
				-                
			
 
				-                // 使用 PositionMappingService 补充页码和行号
			
 
				-                Map<String, Object> enrichedPosition = enrichPosition(documentId, posMap);
			
 
				-                
			
 
				-                log.debug("实体位置信息: name={}, position={}", node.getName(), enrichedPosition);
			
 
				-                node.setPosition(enrichedPosition);
			
 
				+                log.debug("实体位置信息: name={}, position={}", node.getName(), posMap);
			
 
				+                node.setPosition(posMap);
			
 
				             } else {
			
 
				                 log.debug("实体无位置信息: name={}, positionObj={}", node.getName(), positionObj);
			
 
				             }
			
@@ -294,67 +289,6 @@ public class GraphNerService {
 
				         }
			
 
				     }
			
 
				 
			
 
				-    /**
			
 
				-     * 丰富位置信息，补充页码和行号
			
 
				-     * 
			
 
				-     * @param documentId 文档ID
			
 
				-     * @param posMap 原始位置信息（包含 charStart, charEnd）
			
 
				-     * @return 丰富后的位置信息（包含 charStart, charEnd, page, line）
			
 
				-     */
			
 
				-    private Map<String, Object> enrichPosition(String documentId, Map<String, Object> posMap) {
			
 
				-        // 如果已经有页码和行号，直接返回
			
 
				-        if (posMap.containsKey("page") && posMap.get("page") != null 
			
 
				-            && posMap.containsKey("line") && posMap.get("line") != null) {
			
 
				-            log.debug("位置信息已完整，跳过映射: documentId={}", documentId);
			
 
				-            return posMap;
			
 
				-        }
			
 
				-        
			
 
				-        // 获取字符位置
			
 
				-        Integer charStart = getIntValue(posMap, "charStart");
			
 
				-        Integer charEnd = getIntValue(posMap, "charEnd");
			
 
				-        
			
 
				-        if (charStart == null || charEnd == null) {
			
 
				-            log.debug("缺少字符位置信息，跳过映射: documentId={}, posMap={}", documentId, posMap);
			
 
				-            return posMap;
			
 
				-        }
			
 
				-        
			
 
				-        try {
			
 
				-            // 使用 PositionMappingService 映射页码和行号
			
 
				-            log.debug("开始位置映射: documentId={}, charStart={}, charEnd={}", documentId, charStart, charEnd);
			
 
				-            Map<String, Object> mappedPosition = positionMappingService.mapCharToPosition(
			
 
				-                documentId, charStart, charEnd);
			
 
				-            
			
 
				-            // 合并原始位置信息和映射结果
			
 
				-            Map<String, Object> enrichedPosition = new HashMap<>(posMap);
			
 
				-            enrichedPosition.putAll(mappedPosition);
			
 
				-            
			
 
				-            log.debug("位置映射完成: documentId={}, result={}", documentId, enrichedPosition);
			
 
				-            return enrichedPosition;
			
 
				-        } catch (Exception e) {
			
 
				-            log.warn("位置映射失败: documentId={}, charStart={}, charEnd={}, error={}", 
			
 
				-                    documentId, charStart, charEnd, e.getMessage());
			
 
				-            return posMap;
			
 
				-        }
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 从 Map 中获取整数值
			
 
				-     */
			
 
				-    private Integer getIntValue(Map<String, Object> map, String key) {
			
 
				-        Object value = map.get(key);
			
 
				-        if (value == null) {
			
 
				-            return null;
			
 
				-        }
			
 
				-        if (value instanceof Number) {
			
 
				-            return ((Number) value).intValue();
			
 
				-        }
			
 
				-        try {
			
 
				-            return Integer.parseInt(value.toString());
			
 
				-        } catch (NumberFormatException e) {
			
 
				-            return null;
			
 
				-        }
			
 
				-    }
			
 
				-    
			
 
				     /**
			
 
				      * 从 Map 中获取字符串值
			
 
				      */
			
--- a/backend/graph-service/src/main/java/com/lingyue/graph/service/PositionMappingService.java
+++ b/backend/graph-service/src/main/java/com/lingyue/graph/service/PositionMappingService.java
@@ -1,337 +0,0 @@
 
				-package com.lingyue.graph.service;
			
 
				-
			
 
				-import com.fasterxml.jackson.databind.JsonNode;
			
 
				-import com.fasterxml.jackson.databind.ObjectMapper;
			
 
				-import lombok.Data;
			
 
				-import lombok.extern.slf4j.Slf4j;
			
 
				-import org.springframework.beans.factory.annotation.Value;
			
 
				-import org.springframework.stereotype.Service;
			
 
				-
			
 
				-import java.nio.file.Files;
			
 
				-import java.nio.file.Path;
			
 
				-import java.util.HashMap;
			
 
				-import java.util.Map;
			
 
				-import java.util.concurrent.ConcurrentHashMap;
			
 
				-
			
 
				-/**
			
 
				- * 位置映射服务
			
 
				- * 根据文档索引将字符位置映射到页码和行号
			
 
				- * 
			
 
				- * @author lingyue
			
 
				- * @since 2026-01-20
			
 
				- */
			
 
				-@Slf4j
			
 
				-@Service
			
 
				-public class PositionMappingService {
			
 
				-    
			
 
				-    private final ObjectMapper objectMapper;
			
 
				-    
			
 
				-    @Value("${file.storage.text-path:/data/lingyue/texts}")
			
 
				-    private String textStoragePath;
			
 
				-    
			
 
				-    // 索引缓存，避免重复读取文件
			
 
				-    private final Map<String, DocumentIndex> indexCache = new ConcurrentHashMap<>();
			
 
				-    
			
 
				-    public PositionMappingService(ObjectMapper objectMapper) {
			
 
				-        this.objectMapper = objectMapper;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 根据字符位置获取完整位置信息（包含页码和行号）
			
 
				-     * 
			
 
				-     * @param documentId 文档ID
			
 
				-     * @param charStart 字符起始位置
			
 
				-     * @param charEnd 字符结束位置
			
 
				-     * @return 位置信息 Map（包含 charStart, charEnd, page, line）
			
 
				-     */
			
 
				-    public Map<String, Object> mapCharToPosition(String documentId, int charStart, int charEnd) {
			
 
				-        Map<String, Object> position = new HashMap<>();
			
 
				-        position.put("charStart", charStart);
			
 
				-        position.put("charEnd", charEnd);
			
 
				-        
			
 
				-        // 尝试加载文档索引
			
 
				-        DocumentIndex index = loadDocumentIndex(documentId);
			
 
				-        if (index == null) {
			
 
				-            log.debug("未找到文档索引，返回仅包含字符位置的信息: documentId={}", documentId);
			
 
				-            return position;
			
 
				-        }
			
 
				-        
			
 
				-        // 查找页码
			
 
				-        int page = findPage(index, charStart);
			
 
				-        position.put("page", page);
			
 
				-        
			
 
				-        // 查找行号
			
 
				-        int line = findLine(index, charStart);
			
 
				-        position.put("line", line);
			
 
				-        
			
 
				-        // 计算全局行号（如果有行索引）
			
 
				-        if (index.getLines() != null && index.getLines().length > 0) {
			
 
				-            position.put("globalLine", line);
			
 
				-        }
			
 
				-        
			
 
				-        return position;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 根据页码和行号获取字符位置范围
			
 
				-     * 
			
 
				-     * @param documentId 文档ID
			
 
				-     * @param page 页码
			
 
				-     * @param line 行号（页内行号）
			
 
				-     * @return 字符位置范围 [charStart, charEnd] 或 null
			
 
				-     */
			
 
				-    public int[] mapPageLineToChar(String documentId, int page, int line) {
			
 
				-        DocumentIndex index = loadDocumentIndex(documentId);
			
 
				-        if (index == null || index.getPages() == null) {
			
 
				-            return null;
			
 
				-        }
			
 
				-        
			
 
				-        // 找到对应页
			
 
				-        PageIndex pageIndex = null;
			
 
				-        for (PageIndex p : index.getPages()) {
			
 
				-            if (p.getPage() == page) {
			
 
				-                pageIndex = p;
			
 
				-                break;
			
 
				-            }
			
 
				-        }
			
 
				-        
			
 
				-        if (pageIndex == null) {
			
 
				-            return null;
			
 
				-        }
			
 
				-        
			
 
				-        // 如果有行索引，找到具体行
			
 
				-        if (index.getLines() != null) {
			
 
				-            // 计算目标全局行号
			
 
				-            int targetGlobalLine = pageIndex.getLineStart() + line - 1;
			
 
				-            for (LineIndex lineIndex : index.getLines()) {
			
 
				-                if (lineIndex.getLine() == targetGlobalLine) {
			
 
				-                    return new int[]{lineIndex.getCharStart(), lineIndex.getCharEnd()};
			
 
				-                }
			
 
				-            }
			
 
				-        }
			
 
				-        
			
 
				-        // 没有行索引，返回页的范围
			
 
				-        return new int[]{pageIndex.getCharStart(), pageIndex.getCharEnd()};
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 批量映射位置信息
			
 
				-     * 用于一次性处理多个实体的位置
			
 
				-     * 
			
 
				-     * @param documentId 文档ID
			
 
				-     * @param charPositions 字符位置列表 [[charStart1, charEnd1], [charStart2, charEnd2], ...]
			
 
				-     * @return 完整位置信息列表
			
 
				-     */
			
 
				-    public Map<String, Object>[] mapCharToPositionBatch(String documentId, int[][] charPositions) {
			
 
				-        // 预加载索引
			
 
				-        DocumentIndex index = loadDocumentIndex(documentId);
			
 
				-        
			
 
				-        @SuppressWarnings("unchecked")
			
 
				-        Map<String, Object>[] results = new Map[charPositions.length];
			
 
				-        
			
 
				-        for (int i = 0; i < charPositions.length; i++) {
			
 
				-            int charStart = charPositions[i][0];
			
 
				-            int charEnd = charPositions[i][1];
			
 
				-            
			
 
				-            Map<String, Object> position = new HashMap<>();
			
 
				-            position.put("charStart", charStart);
			
 
				-            position.put("charEnd", charEnd);
			
 
				-            
			
 
				-            if (index != null) {
			
 
				-                position.put("page", findPage(index, charStart));
			
 
				-                position.put("line", findLine(index, charStart));
			
 
				-            }
			
 
				-            
			
 
				-            results[i] = position;
			
 
				-        }
			
 
				-        
			
 
				-        return results;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 清除缓存的索引
			
 
				-     */
			
 
				-    public void clearCache(String documentId) {
			
 
				-        indexCache.remove(documentId);
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 清除所有缓存
			
 
				-     */
			
 
				-    public void clearAllCache() {
			
 
				-        indexCache.clear();
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 加载文档索引
			
 
				-     */
			
 
				-    private DocumentIndex loadDocumentIndex(String documentId) {
			
 
				-        // 先检查缓存
			
 
				-        if (indexCache.containsKey(documentId)) {
			
 
				-            log.debug("从缓存加载索引: documentId={}", documentId);
			
 
				-            return indexCache.get(documentId);
			
 
				-        }
			
 
				-        
			
 
				-        // 构建索引文件路径
			
 
				-        String indexFilePath = buildIndexFilePath(documentId);
			
 
				-        Path path = Path.of(indexFilePath);
			
 
				-        
			
 
				-        log.debug("尝试加载索引文件: {}", indexFilePath);
			
 
				-        
			
 
				-        if (!Files.exists(path)) {
			
 
				-            log.info("索引文件不存在，无法补充页码信息: {}", indexFilePath);
			
 
				-            return null;
			
 
				-        }
			
 
				-        
			
 
				-        try {
			
 
				-            String json = Files.readString(path);
			
 
				-            JsonNode root = objectMapper.readTree(json);
			
 
				-            
			
 
				-            DocumentIndex index = new DocumentIndex();
			
 
				-            index.setDocumentId(root.path("documentId").asText(documentId));
			
 
				-            index.setTotalChars(root.path("totalChars").asInt(0));
			
 
				-            index.setTotalLines(root.path("totalLines").asInt(0));
			
 
				-            index.setTotalPages(root.path("totalPages").asInt(0));
			
 
				-            
			
 
				-            // 解析页面索引
			
 
				-            JsonNode pagesNode = root.path("pages");
			
 
				-            if (pagesNode.isArray()) {
			
 
				-                PageIndex[] pages = new PageIndex[pagesNode.size()];
			
 
				-                for (int i = 0; i < pagesNode.size(); i++) {
			
 
				-                    JsonNode pageNode = pagesNode.get(i);
			
 
				-                    PageIndex pageIndex = new PageIndex();
			
 
				-                    pageIndex.setPage(pageNode.path("page").asInt(i + 1));
			
 
				-                    pageIndex.setCharStart(pageNode.path("charStart").asInt(0));
			
 
				-                    pageIndex.setCharEnd(pageNode.path("charEnd").asInt(0));
			
 
				-                    pageIndex.setLineStart(pageNode.path("lineStart").asInt(1));
			
 
				-                    pageIndex.setLineEnd(pageNode.path("lineEnd").asInt(1));
			
 
				-                    pages[i] = pageIndex;
			
 
				-                }
			
 
				-                index.setPages(pages);
			
 
				-            }
			
 
				-            
			
 
				-            // 解析行索引
			
 
				-            JsonNode linesNode = root.path("lines");
			
 
				-            if (linesNode.isArray()) {
			
 
				-                LineIndex[] lines = new LineIndex[linesNode.size()];
			
 
				-                for (int i = 0; i < linesNode.size(); i++) {
			
 
				-                    JsonNode lineNode = linesNode.get(i);
			
 
				-                    LineIndex lineIndex = new LineIndex();
			
 
				-                    lineIndex.setLine(lineNode.path("line").asInt(i + 1));
			
 
				-                    lineIndex.setCharStart(lineNode.path("charStart").asInt(0));
			
 
				-                    lineIndex.setCharEnd(lineNode.path("charEnd").asInt(0));
			
 
				-                    lines[i] = lineIndex;
			
 
				-                }
			
 
				-                index.setLines(lines);
			
 
				-            }
			
 
				-            
			
 
				-            // 缓存
			
 
				-            indexCache.put(documentId, index);
			
 
				-            log.debug("已加载并缓存文档索引: documentId={}, pages={}, lines={}", 
			
 
				-                    documentId, 
			
 
				-                    index.getPages() != null ? index.getPages().length : 0,
			
 
				-                    index.getLines() != null ? index.getLines().length : 0);
			
 
				-            
			
 
				-            return index;
			
 
				-        } catch (Exception e) {
			
 
				-            log.error("加载文档索引失败: documentId={}", documentId, e);
			
 
				-            return null;
			
 
				-        }
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 构建索引文件路径
			
 
				-     */
			
 
				-    private String buildIndexFilePath(String documentId) {
			
 
				-        return Path.of(
			
 
				-                textStoragePath,
			
 
				-                documentId.substring(0, 2),
			
 
				-                documentId + "_index.json"
			
 
				-        ).toString();
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 使用二分查找页码
			
 
				-     */
			
 
				-    private int findPage(DocumentIndex index, int charPosition) {
			
 
				-        if (index.getPages() == null || index.getPages().length == 0) {
			
 
				-            return 1;
			
 
				-        }
			
 
				-        
			
 
				-        for (PageIndex page : index.getPages()) {
			
 
				-            if (charPosition >= page.getCharStart() && charPosition <= page.getCharEnd()) {
			
 
				-                return page.getPage();
			
 
				-            }
			
 
				-        }
			
 
				-        
			
 
				-        // 如果未找到，返回最后一页
			
 
				-        return index.getPages()[index.getPages().length - 1].getPage();
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 使用二分查找行号
			
 
				-     */
			
 
				-    private int findLine(DocumentIndex index, int charPosition) {
			
 
				-        if (index.getLines() == null || index.getLines().length == 0) {
			
 
				-            return 1;
			
 
				-        }
			
 
				-        
			
 
				-        // 二分查找
			
 
				-        int left = 0;
			
 
				-        int right = index.getLines().length - 1;
			
 
				-        
			
 
				-        while (left <= right) {
			
 
				-            int mid = (left + right) / 2;
			
 
				-            LineIndex line = index.getLines()[mid];
			
 
				-            
			
 
				-            if (charPosition < line.getCharStart()) {
			
 
				-                right = mid - 1;
			
 
				-            } else if (charPosition > line.getCharEnd()) {
			
 
				-                left = mid + 1;
			
 
				-            } else {
			
 
				-                return line.getLine();
			
 
				-            }
			
 
				-        }
			
 
				-        
			
 
				-        // 如果未找到，返回最近的行
			
 
				-        if (left >= index.getLines().length) {
			
 
				-            return index.getLines()[index.getLines().length - 1].getLine();
			
 
				-        }
			
 
				-        return index.getLines()[left].getLine();
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 文档索引
			
 
				-     */
			
 
				-    @Data
			
 
				-    public static class DocumentIndex {
			
 
				-        private String documentId;
			
 
				-        private PageIndex[] pages;
			
 
				-        private LineIndex[] lines;
			
 
				-        private int totalChars;
			
 
				-        private int totalLines;
			
 
				-        private int totalPages;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 页面索引
			
 
				-     */
			
 
				-    @Data
			
 
				-    public static class PageIndex {
			
 
				-        private int page;
			
 
				-        private int charStart;
			
 
				-        private int charEnd;
			
 
				-        private int lineStart;
			
 
				-        private int lineEnd;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 行索引
			
 
				-     */
			
 
				-    @Data
			
 
				-    public static class LineIndex {
			
 
				-        private int line;
			
 
				-        private int charStart;
			
 
				-        private int charEnd;
			
 
				-    }
			
 
				-}
			
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/DocumentIndexService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/DocumentIndexService.java
@@ -1,313 +0,0 @@
 
				-package com.lingyue.parse.service;
			
 
				-
			
 
				-import com.fasterxml.jackson.databind.ObjectMapper;
			
 
				-import lombok.Data;
			
 
				-import lombok.RequiredArgsConstructor;
			
 
				-import lombok.extern.slf4j.Slf4j;
			
 
				-import org.springframework.stereotype.Service;
			
 
				-
			
 
				-import java.nio.file.Files;
			
 
				-import java.nio.file.Path;
			
 
				-import java.nio.file.Paths;
			
 
				-import java.util.ArrayList;
			
 
				-import java.util.List;
			
 
				-
			
 
				-/**
			
 
				- * 文档索引服务
			
 
				- * 负责生成和管理文档的位置索引
			
 
				- * 
			
 
				- * @author lingyue
			
 
				- * @since 2026-01-20
			
 
				- */
			
 
				-@Slf4j
			
 
				-@Service
			
 
				-@RequiredArgsConstructor
			
 
				-public class DocumentIndexService {
			
 
				-    
			
 
				-    private final ObjectMapper objectMapper;
			
 
				-    
			
 
				-    /**
			
 
				-     * 分页符字符（Form Feed）
			
 
				-     */
			
 
				-    private static final char FORM_FEED = '\f';
			
 
				-    
			
 
				-    /**
			
 
				-     * 为纯文本生成行索引（Word/Excel等文档）
			
 
				-     * 如果文本中包含分页符(\f)，则根据分页符生成页面索引
			
 
				-     * 
			
 
				-     * @param text 文本内容
			
 
				-     * @param documentId 文档ID
			
 
				-     * @param indexOutputPath 索引输出路径
			
 
				-     * @return 文档索引
			
 
				-     */
			
 
				-    public DocumentIndex generateLineIndex(String text, String documentId, String indexOutputPath) {
			
 
				-        if (text == null || text.isEmpty()) {
			
 
				-            return createEmptyIndex(documentId);
			
 
				-        }
			
 
				-        
			
 
				-        // 检查是否包含分页符
			
 
				-        boolean hasPageBreaks = text.indexOf(FORM_FEED) >= 0;
			
 
				-        
			
 
				-        if (hasPageBreaks) {
			
 
				-            log.info("检测到分页符，生成分页索引: documentId={}", documentId);
			
 
				-            return generateIndexWithPageBreaks(text, documentId, indexOutputPath);
			
 
				-        } else {
			
 
				-            log.debug("无分页符，生成单页索引: documentId={}", documentId);
			
 
				-            return generateSinglePageIndex(text, documentId, indexOutputPath);
			
 
				-        }
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 根据分页符生成多页索引
			
 
				-     */
			
 
				-    private DocumentIndex generateIndexWithPageBreaks(String text, String documentId, String indexOutputPath) {
			
 
				-        List<PageIndex> pageIndices = new ArrayList<>();
			
 
				-        List<LineIndex> lineIndices = new ArrayList<>();
			
 
				-        
			
 
				-        int charPos = 0;
			
 
				-        int lineNum = 1;
			
 
				-        int pageNum = 1;
			
 
				-        int pageCharStart = 0;
			
 
				-        int pageLineStart = 1;
			
 
				-        
			
 
				-        String[] lines = text.split("\n", -1);
			
 
				-        
			
 
				-        for (String line : lines) {
			
 
				-            // 检查这一行是否包含分页符
			
 
				-            int ffIndex = line.indexOf(FORM_FEED);
			
 
				-            
			
 
				-            if (ffIndex >= 0) {
			
 
				-                // 处理分页符之前的内容
			
 
				-                if (ffIndex > 0) {
			
 
				-                    LineIndex lineIndex = new LineIndex();
			
 
				-                    lineIndex.setLine(lineNum);
			
 
				-                    lineIndex.setCharStart(charPos);
			
 
				-                    lineIndex.setCharEnd(charPos + ffIndex);
			
 
				-                    lineIndices.add(lineIndex);
			
 
				-                }
			
 
				-                
			
 
				-                // 结束当前页
			
 
				-                PageIndex pageIndex = new PageIndex();
			
 
				-                pageIndex.setPage(pageNum);
			
 
				-                pageIndex.setCharStart(pageCharStart);
			
 
				-                pageIndex.setCharEnd(charPos + ffIndex);
			
 
				-                pageIndex.setLineStart(pageLineStart);
			
 
				-                pageIndex.setLineEnd(lineNum);
			
 
				-                pageIndices.add(pageIndex);
			
 
				-                
			
 
				-                // 开始新页
			
 
				-                pageNum++;
			
 
				-                pageCharStart = charPos + ffIndex + 1; // +1 跳过分页符
			
 
				-                pageLineStart = lineNum + 1;
			
 
				-                
			
 
				-                // 处理分页符之后的内容（如果有）
			
 
				-                if (ffIndex + 1 < line.length()) {
			
 
				-                    lineNum++;
			
 
				-                    LineIndex afterLineIndex = new LineIndex();
			
 
				-                    afterLineIndex.setLine(lineNum);
			
 
				-                    afterLineIndex.setCharStart(charPos + ffIndex + 1);
			
 
				-                    afterLineIndex.setCharEnd(charPos + line.length());
			
 
				-                    lineIndices.add(afterLineIndex);
			
 
				-                }
			
 
				-            } else {
			
 
				-                // 普通行
			
 
				-                LineIndex lineIndex = new LineIndex();
			
 
				-                lineIndex.setLine(lineNum);
			
 
				-                lineIndex.setCharStart(charPos);
			
 
				-                lineIndex.setCharEnd(charPos + line.length());
			
 
				-                lineIndices.add(lineIndex);
			
 
				-            }
			
 
				-            
			
 
				-            charPos += line.length() + 1; // +1 for \n
			
 
				-            lineNum++;
			
 
				-        }
			
 
				-        
			
 
				-        // 添加最后一页
			
 
				-        if (pageCharStart < text.length()) {
			
 
				-            PageIndex lastPage = new PageIndex();
			
 
				-            lastPage.setPage(pageNum);
			
 
				-            lastPage.setCharStart(pageCharStart);
			
 
				-            lastPage.setCharEnd(text.length());
			
 
				-            lastPage.setLineStart(pageLineStart);
			
 
				-            lastPage.setLineEnd(lineNum - 1);
			
 
				-            pageIndices.add(lastPage);
			
 
				-        }
			
 
				-        
			
 
				-        // 创建文档索引
			
 
				-        DocumentIndex index = new DocumentIndex();
			
 
				-        index.setDocumentId(documentId);
			
 
				-        index.setTotalChars(text.length());
			
 
				-        index.setTotalLines(lineNum - 1);
			
 
				-        index.setTotalPages(pageIndices.size());
			
 
				-        index.setPages(pageIndices);
			
 
				-        index.setLines(lineIndices);
			
 
				-        
			
 
				-        log.info("分页索引生成完成: documentId={}, pages={}, lines={}", 
			
 
				-                documentId, pageIndices.size(), lineIndices.size());
			
 
				-        
			
 
				-        // 保存索引文件
			
 
				-        if (indexOutputPath != null) {
			
 
				-            saveIndexFile(index, indexOutputPath);
			
 
				-        }
			
 
				-        
			
 
				-        return index;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 生成单页索引（无分页符的文档）
			
 
				-     */
			
 
				-    private DocumentIndex generateSinglePageIndex(String text, String documentId, String indexOutputPath) {
			
 
				-        List<LineIndex> lineIndices = new ArrayList<>();
			
 
				-        int charPos = 0;
			
 
				-        int lineNum = 1;
			
 
				-        
			
 
				-        String[] lines = text.split("\n", -1);
			
 
				-        for (String line : lines) {
			
 
				-            LineIndex lineIndex = new LineIndex();
			
 
				-            lineIndex.setLine(lineNum);
			
 
				-            lineIndex.setCharStart(charPos);
			
 
				-            lineIndex.setCharEnd(charPos + line.length());
			
 
				-            lineIndices.add(lineIndex);
			
 
				-            
			
 
				-            charPos += line.length() + 1; // +1 for \n
			
 
				-            lineNum++;
			
 
				-        }
			
 
				-        
			
 
				-        // 创建文档索引（无分页，只有行索引）
			
 
				-        DocumentIndex index = new DocumentIndex();
			
 
				-        index.setDocumentId(documentId);
			
 
				-        index.setTotalChars(text.length());
			
 
				-        index.setTotalLines(lines.length);
			
 
				-        index.setTotalPages(1); // 非分页文档统一为1页
			
 
				-        index.setLines(lineIndices);
			
 
				-        
			
 
				-        // 创建虚拟的单页索引
			
 
				-        List<PageIndex> pages = new ArrayList<>();
			
 
				-        PageIndex singlePage = new PageIndex();
			
 
				-        singlePage.setPage(1);
			
 
				-        singlePage.setCharStart(0);
			
 
				-        singlePage.setCharEnd(text.length());
			
 
				-        singlePage.setLineStart(1);
			
 
				-        singlePage.setLineEnd(lines.length);
			
 
				-        pages.add(singlePage);
			
 
				-        index.setPages(pages);
			
 
				-        
			
 
				-        // 保存索引文件
			
 
				-        if (indexOutputPath != null) {
			
 
				-            saveIndexFile(index, indexOutputPath);
			
 
				-        }
			
 
				-        
			
 
				-        return index;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 读取索引文件
			
 
				-     */
			
 
				-    public DocumentIndex loadIndex(String indexFilePath) {
			
 
				-        try {
			
 
				-            Path path = Paths.get(indexFilePath);
			
 
				-            if (!Files.exists(path)) {
			
 
				-                log.warn("索引文件不存在: {}", indexFilePath);
			
 
				-                return null;
			
 
				-            }
			
 
				-            String json = Files.readString(path);
			
 
				-            return objectMapper.readValue(json, DocumentIndex.class);
			
 
				-        } catch (Exception e) {
			
 
				-            log.error("读取索引文件失败: {}", indexFilePath, e);
			
 
				-            return null;
			
 
				-        }
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 根据字符位置查找页码和行号
			
 
				-     * 
			
 
				-     * @param index 文档索引
			
 
				-     * @param charPosition 字符位置
			
 
				-     * @return [页码, 行号] 或 null（未找到）
			
 
				-     */
			
 
				-    public int[] findPageAndLine(DocumentIndex index, int charPosition) {
			
 
				-        if (index == null || index.getPages() == null) {
			
 
				-            return null;
			
 
				-        }
			
 
				-        
			
 
				-        // 1. 找到所在页
			
 
				-        int page = 1;
			
 
				-        for (PageIndex pageIndex : index.getPages()) {
			
 
				-            if (charPosition >= pageIndex.getCharStart() && charPosition <= pageIndex.getCharEnd()) {
			
 
				-                page = pageIndex.getPage();
			
 
				-                break;
			
 
				-            }
			
 
				-        }
			
 
				-        
			
 
				-        // 2. 找到所在行
			
 
				-        int line = 1;
			
 
				-        if (index.getLines() != null) {
			
 
				-            for (LineIndex lineIndex : index.getLines()) {
			
 
				-                if (charPosition >= lineIndex.getCharStart() && charPosition <= lineIndex.getCharEnd()) {
			
 
				-                    line = lineIndex.getLine();
			
 
				-                    break;
			
 
				-                }
			
 
				-            }
			
 
				-        }
			
 
				-        
			
 
				-        return new int[]{page, line};
			
 
				-    }
			
 
				-    
			
 
				-    private DocumentIndex createEmptyIndex(String documentId) {
			
 
				-        DocumentIndex index = new DocumentIndex();
			
 
				-        index.setDocumentId(documentId);
			
 
				-        index.setTotalChars(0);
			
 
				-        index.setTotalLines(0);
			
 
				-        index.setTotalPages(0);
			
 
				-        index.setPages(new ArrayList<>());
			
 
				-        return index;
			
 
				-    }
			
 
				-    
			
 
				-    private void saveIndexFile(DocumentIndex index, String outputPath) {
			
 
				-        try {
			
 
				-            Path path = Paths.get(outputPath);
			
 
				-            Files.createDirectories(path.getParent());
			
 
				-            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
			
 
				-            Files.writeString(path, json);
			
 
				-            log.info("文档索引文件已保存: {}", outputPath);
			
 
				-        } catch (Exception e) {
			
 
				-            log.error("保存文档索引文件失败: {}", outputPath, e);
			
 
				-        }
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 文档索引
			
 
				-     */
			
 
				-    @Data
			
 
				-    public static class DocumentIndex {
			
 
				-        private String documentId;
			
 
				-        private List<PageIndex> pages;
			
 
				-        private List<LineIndex> lines;
			
 
				-        private int totalChars;
			
 
				-        private int totalLines;
			
 
				-        private int totalPages;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 页面索引
			
 
				-     */
			
 
				-    @Data
			
 
				-    public static class PageIndex {
			
 
				-        private int page;
			
 
				-        private int charStart;
			
 
				-        private int charEnd;
			
 
				-        private int lineStart;
			
 
				-        private int lineEnd;
			
 
				-        private boolean ocrUsed;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 行索引
			
 
				-     */
			
 
				-    @Data
			
 
				-    public static class LineIndex {
			
 
				-        private int line;
			
 
				-        private int charStart;
			
 
				-        private int charEnd;
			
 
				-    }
			
 
				-}
			
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/ParseService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/ParseService.java
@@ -32,7 +32,6 @@ public class ParseService {
 
				     private final ExcelTextExtractionService excelTextExtractionService;
			
 
				     private final OcrResultParser ocrResultParser;
			
 
				     private final LayoutAnalysisService layoutAnalysisService;
			
 
				-    private final DocumentIndexService documentIndexService;
			
 
				     private final FileStorageProperties fileStorageProperties;
			
 
				     // 单体应用直接注入 Service，不使用 Feign Client
			
 
				     private final com.lingyue.graph.service.TextStorageService textStorageService;
			
@@ -97,12 +96,9 @@ public class ParseService {
 
				                 task.setProgress(20);
			
 
				                 saveParseTask(task);
			
 
				                 
			
 
				-                // PDF使用分页判断逻辑，并生成页面索引
			
 
				-                String indexFilePath = buildIndexFilePath(documentId);
			
 
				-                PdfTextExtractionService.ExtractionResult extractionResult = 
			
 
				-                    pdfTextExtractionService.extractTextWithIndex(sourceFilePath, documentId, indexFilePath);
			
 
				-                plainText = extractionResult.getText();
			
 
				-                log.info("PDF提取完成，索引文件: {}", indexFilePath);
			
 
				+                // PDF使用分页判断逻辑
			
 
				+                plainText = pdfTextExtractionService.extractText(sourceFilePath);
			
 
				+                log.info("PDF提取完成，文本长度: {}", plainText.length());
			
 
				             } else if (fileType == FileType.WORD || fileType == FileType.WORD_OLD) {
			
 
				                 log.info("处理Word文件: {}", sourceFilePath);
			
 
				                 task.setCurrentStep("word_extraction");
			
@@ -111,11 +107,7 @@ public class ParseService {
 
				                 
			
 
				                 // Word文档直接提取文本
			
 
				                 plainText = wordTextExtractionService.extractText(sourceFilePath);
			
 
				-                
			
 
				-                // 为Word生成行索引
			
 
				-                String indexFilePath = buildIndexFilePath(documentId);
			
 
				-                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
			
 
				-                log.info("Word提取完成，索引文件: {}", indexFilePath);
			
 
				+                log.info("Word提取完成，文本长度: {}", plainText.length());
			
 
				             } else if (fileType == FileType.EXCEL || fileType == FileType.EXCEL_OLD) {
			
 
				                 log.info("处理Excel文件: {}", sourceFilePath);
			
 
				                 task.setCurrentStep("excel_extraction");
			
@@ -124,11 +116,7 @@ public class ParseService {
 
				                 
			
 
				                 // Excel表格直接提取文本
			
 
				                 plainText = excelTextExtractionService.extractText(sourceFilePath);
			
 
				-                
			
 
				-                // 为Excel生成行索引
			
 
				-                String indexFilePath = buildIndexFilePath(documentId);
			
 
				-                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
			
 
				-                log.info("Excel提取完成，索引文件: {}", indexFilePath);
			
 
				+                log.info("Excel提取完成，文本长度: {}", plainText.length());
			
 
				             } else if (fileType.isImage()) {
			
 
				                 log.info("处理图片文件: {}", sourceFilePath);
			
 
				                 task.setCurrentStep("ocr");
			
@@ -283,18 +271,6 @@ public class ParseService {
 
				         return path.toString();
			
 
				     }
			
 
				     
			
 
				-    /**
			
 
				-     * 根据文档ID构建索引文件存储路径
			
 
				-     */
			
 
				-    private String buildIndexFilePath(String documentId) {
			
 
				-        Path path = Path.of(
			
 
				-                fileStorageProperties.getTextPath(),
			
 
				-                documentId.substring(0, 2),
			
 
				-                documentId + "_index.json"
			
 
				-        );
			
 
				-        return path.toString();
			
 
				-    }
			
 
				-
			
 
				     /**
			
 
				      * 将纯文本写入 TXT 文件
			
 
				      * 对于大文件使用分块写入，避免内存溢出
			
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/PdfTextExtractionService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/PdfTextExtractionService.java
@@ -1,8 +1,6 @@
 
				 package com.lingyue.parse.service;
			
 
				 
			
 
				-import com.fasterxml.jackson.databind.ObjectMapper;
			
 
				 import com.lingyue.common.exception.ServiceException;
			
 
				-import lombok.Data;
			
 
				 import lombok.RequiredArgsConstructor;
			
 
				 import lombok.extern.slf4j.Slf4j;
			
 
				 import org.apache.pdfbox.pdmodel.PDDocument;
			
@@ -16,7 +14,6 @@ import java.io.File;
 
				 import java.io.IOException;
			
 
				 import java.nio.file.Files;
			
 
				 import java.nio.file.Path;
			
 
				-import java.nio.file.Paths;
			
 
				 import java.util.ArrayList;
			
 
				 import java.util.List;
			
 
				 import java.util.UUID;
			
@@ -35,7 +32,6 @@ public class PdfTextExtractionService {
 
				     
			
 
				     private final PaddleOcrClient paddleOcrClient;
			
 
				     private final OcrResultParser ocrResultParser;
			
 
				-    private final ObjectMapper objectMapper;
			
 
				     
			
 
				     /**
			
 
				      * 文本阈值：每页至少需要这么多字符才认为有文本层
			
@@ -209,170 +205,6 @@ public class PdfTextExtractionService {
 
				         return combinedText.toString();
			
 
				     }
			
 
				     
			
 
				-    /**
			
 
				-     * 提取PDF文本并生成页面索引
			
 
				-     * 
			
 
				-     * @param pdfFilePath PDF文件路径
			
 
				-     * @param documentId 文档ID
			
 
				-     * @param indexOutputPath 索引文件输出路径（如果为null则不生成索引文件）
			
 
				-     * @return 提取结果（包含文本和索引）
			
 
				-     */
			
 
				-    public ExtractionResult extractTextWithIndex(String pdfFilePath, String documentId, String indexOutputPath) {
			
 
				-        File pdfFile = new File(pdfFilePath);
			
 
				-        if (!pdfFile.exists()) {
			
 
				-            throw new ServiceException("PDF文件不存在: " + pdfFilePath);
			
 
				-        }
			
 
				-        
			
 
				-        List<PageTextResult> pageResults = new ArrayList<>();
			
 
				-        
			
 
				-        try (PDDocument document = org.apache.pdfbox.Loader.loadPDF(pdfFile)) {
			
 
				-            int totalPages = document.getNumberOfPages();
			
 
				-            log.info("开始处理PDF文件（带索引）: {}, 总页数: {}", pdfFilePath, totalPages);
			
 
				-            
			
 
				-            PDFTextStripper textStripper = new PDFTextStripper();
			
 
				-            
			
 
				-            // 逐页处理
			
 
				-            for (int pageNum = 1; pageNum <= totalPages; pageNum++) {
			
 
				-                log.debug("处理第 {} 页/共 {} 页", pageNum, totalPages);
			
 
				-                
			
 
				-                try {
			
 
				-                    textStripper.setStartPage(pageNum);
			
 
				-                    textStripper.setEndPage(pageNum);
			
 
				-                    String pageText = textStripper.getText(document);
			
 
				-                    
			
 
				-                    if (hasSufficientText(pageText)) {
			
 
				-                        log.debug("第 {} 页有文本层，直接使用，文本长度: {}", pageNum, pageText.length());
			
 
				-                        pageResults.add(new PageTextResult(pageNum, pageText, false));
			
 
				-                    } else {
			
 
				-                        log.debug("第 {} 页文本不足，使用OCR处理", pageNum);
			
 
				-                        String ocrText = extractTextByOcr(pdfFilePath, pageNum);
			
 
				-                        pageResults.add(new PageTextResult(pageNum, ocrText, true));
			
 
				-                    }
			
 
				-                } catch (Exception e) {
			
 
				-                    log.error("处理第 {} 页时出错，尝试使用OCR", pageNum, e);
			
 
				-                    try {
			
 
				-                        String ocrText = extractTextByOcr(pdfFilePath, pageNum);
			
 
				-                        pageResults.add(new PageTextResult(pageNum, ocrText, true));
			
 
				-                    } catch (Exception ocrException) {
			
 
				-                        log.error("第 {} 页OCR也失败", pageNum, ocrException);
			
 
				-                        pageResults.add(new PageTextResult(pageNum, "", true));
			
 
				-                    }
			
 
				-                }
			
 
				-            }
			
 
				-        } catch (IOException e) {
			
 
				-            log.error("读取PDF文件失败: {}", pdfFilePath, e);
			
 
				-            throw new ServiceException("读取PDF文件失败: " + e.getMessage());
			
 
				-        }
			
 
				-        
			
 
				-        // 生成带索引的合并结果
			
 
				-        return combinePageTextsWithIndex(pageResults, documentId, indexOutputPath);
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 合并页面文本并生成索引
			
 
				-     */
			
 
				-    private ExtractionResult combinePageTextsWithIndex(List<PageTextResult> pageResults, 
			
 
				-                                                        String documentId, 
			
 
				-                                                        String indexOutputPath) {
			
 
				-        StringBuilder combinedText = new StringBuilder();
			
 
				-        List<PageIndex> pageIndices = new ArrayList<>();
			
 
				-        List<LineIndex> lineIndices = new ArrayList<>();
			
 
				-        
			
 
				-        int currentCharPos = 0;
			
 
				-        int currentLine = 1;
			
 
				-        
			
 
				-        for (PageTextResult result : pageResults) {
			
 
				-            int pageCharStart = currentCharPos;
			
 
				-            int pageLineStart = currentLine;
			
 
				-            
			
 
				-            if (result.getText() != null && !result.getText().trim().isEmpty()) {
			
 
				-                // 添加页头标记
			
 
				-                String pageHeader = "=== 第 " + result.getPageNum() + " 页";
			
 
				-                if (result.isOcrUsed()) {
			
 
				-                    pageHeader += " (OCR识别)";
			
 
				-                } else {
			
 
				-                    pageHeader += " (文本层提取)";
			
 
				-                }
			
 
				-                pageHeader += " ===\n";
			
 
				-                
			
 
				-                // 记录页头行的索引
			
 
				-                LineIndex headerLineIndex = new LineIndex();
			
 
				-                headerLineIndex.setLine(currentLine);
			
 
				-                headerLineIndex.setCharStart(currentCharPos);
			
 
				-                headerLineIndex.setCharEnd(currentCharPos + pageHeader.length() - 2); // -2 去掉 \n
			
 
				-                lineIndices.add(headerLineIndex);
			
 
				-                
			
 
				-                combinedText.append(pageHeader);
			
 
				-                currentCharPos += pageHeader.length();
			
 
				-                currentLine++;
			
 
				-                
			
 
				-                // 添加页面内容并记录每行索引
			
 
				-                String pageText = result.getText();
			
 
				-                String[] lines = pageText.split("\n", -1);
			
 
				-                for (String line : lines) {
			
 
				-                    LineIndex lineIndex = new LineIndex();
			
 
				-                    lineIndex.setLine(currentLine);
			
 
				-                    lineIndex.setCharStart(currentCharPos);
			
 
				-                    lineIndex.setCharEnd(currentCharPos + line.length());
			
 
				-                    lineIndices.add(lineIndex);
			
 
				-                    
			
 
				-                    currentCharPos += line.length() + 1; // +1 for \n
			
 
				-                    currentLine++;
			
 
				-                }
			
 
				-                
			
 
				-                // 添加额外的空行分隔符
			
 
				-                combinedText.append(pageText).append("\n\n");
			
 
				-                currentCharPos++; // 额外的 \n
			
 
				-                currentLine++;
			
 
				-            }
			
 
				-            
			
 
				-            // 创建页面索引
			
 
				-            PageIndex pageIndex = new PageIndex();
			
 
				-            pageIndex.setPage(result.getPageNum());
			
 
				-            pageIndex.setCharStart(pageCharStart);
			
 
				-            pageIndex.setCharEnd(currentCharPos - 1);
			
 
				-            pageIndex.setLineStart(pageLineStart);
			
 
				-            pageIndex.setLineEnd(currentLine - 1);
			
 
				-            pageIndex.setOcrUsed(result.isOcrUsed());
			
 
				-            pageIndices.add(pageIndex);
			
 
				-        }
			
 
				-        
			
 
				-        // 创建文档索引
			
 
				-        DocumentIndex documentIndex = new DocumentIndex();
			
 
				-        documentIndex.setDocumentId(documentId);
			
 
				-        documentIndex.setPages(pageIndices);
			
 
				-        documentIndex.setLines(lineIndices);
			
 
				-        documentIndex.setTotalChars(currentCharPos);
			
 
				-        documentIndex.setTotalLines(currentLine - 1);
			
 
				-        documentIndex.setTotalPages(pageResults.size());
			
 
				-        
			
 
				-        // 保存索引文件
			
 
				-        if (indexOutputPath != null) {
			
 
				-            saveIndexFile(documentIndex, indexOutputPath);
			
 
				-        }
			
 
				-        
			
 
				-        ExtractionResult result = new ExtractionResult();
			
 
				-        result.setText(combinedText.toString());
			
 
				-        result.setIndex(documentIndex);
			
 
				-        return result;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 保存索引文件
			
 
				-     */
			
 
				-    private void saveIndexFile(DocumentIndex index, String outputPath) {
			
 
				-        try {
			
 
				-            Path path = Paths.get(outputPath);
			
 
				-            Files.createDirectories(path.getParent());
			
 
				-            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
			
 
				-            Files.writeString(path, json);
			
 
				-            log.info("页面索引文件已保存: {}", outputPath);
			
 
				-        } catch (Exception e) {
			
 
				-            log.error("保存页面索引文件失败: {}", outputPath, e);
			
 
				-            // 索引保存失败不影响主流程
			
 
				-        }
			
 
				-    }
			
 
				-    
			
 
				     /**
			
 
				      * 页面文本结果
			
 
				      */
			
@@ -399,49 +231,4 @@ public class PdfTextExtractionService {
 
				             return ocrUsed;
			
 
				         }
			
 
				     }
			
 
				-    
			
 
				-    /**
			
 
				-     * 提取结果（包含文本和索引）
			
 
				-     */
			
 
				-    @Data
			
 
				-    public static class ExtractionResult {
			
 
				-        private String text;
			
 
				-        private DocumentIndex index;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 文档索引
			
 
				-     */
			
 
				-    @Data
			
 
				-    public static class DocumentIndex {
			
 
				-        private String documentId;
			
 
				-        private List<PageIndex> pages;
			
 
				-        private List<LineIndex> lines;
			
 
				-        private int totalChars;
			
 
				-        private int totalLines;
			
 
				-        private int totalPages;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 行索引
			
 
				-     */
			
 
				-    @Data
			
 
				-    public static class LineIndex {
			
 
				-        private int line;
			
 
				-        private int charStart;
			
 
				-        private int charEnd;
			
 
				-    }
			
 
				-    
			
 
				-    /**
			
 
				-     * 页面索引
			
 
				-     */
			
 
				-    @Data
			
 
				-    public static class PageIndex {
			
 
				-        private int page;
			
 
				-        private int charStart;
			
 
				-        private int charEnd;
			
 
				-        private int lineStart;
			
 
				-        private int lineEnd;
			
 
				-        private boolean ocrUsed;
			
 
				-    }
			
 
				 }
			
--- a/backend/parse-service/src/main/java/com/lingyue/parse/service/WordTextExtractionService.java
+++ b/backend/parse-service/src/main/java/com/lingyue/parse/service/WordTextExtractionService.java
@@ -4,8 +4,8 @@ import com.lingyue.common.exception.ServiceException;
 
				 import lombok.extern.slf4j.Slf4j;
			
 
				 import org.apache.poi.hwpf.HWPFDocument;
			
 
				 import org.apache.poi.hwpf.extractor.WordExtractor;
			
 
				-import org.apache.poi.xwpf.usermodel.*;
			
 
				-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
			
 
				+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
			
 
				+import org.apache.poi.xwpf.usermodel.XWPFDocument;
			
 
				 import org.springframework.stereotype.Service;
			
 
				 
			
 
				 import java.io.File;
			
@@ -15,7 +15,6 @@ import java.io.IOException;
 
				 /**
			
 
				  * Word文档文本提取服务
			
 
				  * 支持.docx和.doc格式
			
 
				- * 在分页符位置插入 \f 字符，便于后续识别页码
			
 
				  * 
			
 
				  * @author lingyue
			
 
				  * @since 2026-01-14
			
@@ -24,16 +23,11 @@ import java.io.IOException;
 
				 @Service
			
 
				 public class WordTextExtractionService {
			
 
				     
			
 
				-    /**
			
 
				-     * 分页符字符
			
 
				-     */
			
 
				-    private static final char PAGE_BREAK = '\f';
			
 
				-    
			
 
				     /**
			
 
				      * 提取Word文档文本
			
 
				      * 
			
 
				      * @param wordFilePath Word文件路径
			
 
				-     * @return 提取的文本内容（分页符位置插入 \f）
			
 
				+     * @return 提取的文本内容
			
 
				      */
			
 
				     public String extractText(String wordFilePath) {
			
 
				         File wordFile = new File(wordFilePath);
			
@@ -45,7 +39,7 @@ public class WordTextExtractionService {
 
				         
			
 
				         try {
			
 
				             if (fileName.endsWith(".docx")) {
			
 
				-                return extractFromDocxWithPageBreaks(wordFilePath);
			
 
				+                return extractFromDocx(wordFilePath);
			
 
				             } else if (fileName.endsWith(".doc")) {
			
 
				                 return extractFromDoc(wordFilePath);
			
 
				             } else {
			
@@ -58,88 +52,23 @@ public class WordTextExtractionService {
 
				     }
			
 
				     
			
 
				     /**
			
 
				-     * 从.docx文件提取文本，保留分页符
			
 
				+     * 从.docx文件提取文本
			
 
				      */
			
 
				-    private String extractFromDocxWithPageBreaks(String filePath) throws IOException {
			
 
				-        log.info("提取.docx文件文本（含分页符）: {}", filePath);
			
 
				-        
			
 
				-        StringBuilder sb = new StringBuilder();
			
 
				-        int pageBreakCount = 0;
			
 
				+    private String extractFromDocx(String filePath) throws IOException {
			
 
				+        log.info("提取.docx文件文本: {}", filePath);
			
 
				         
			
 
				         try (FileInputStream fis = new FileInputStream(filePath);
			
 
				-             XWPFDocument document = new XWPFDocument(fis)) {
			
 
				-            
			
 
				-            // 遍历文档主体的所有元素
			
 
				-            for (IBodyElement element : document.getBodyElements()) {
			
 
				-                if (element instanceof XWPFParagraph) {
			
 
				-                    XWPFParagraph paragraph = (XWPFParagraph) element;
			
 
				-                    
			
 
				-                    // 检查段落中的分页符
			
 
				-                    for (XWPFRun run : paragraph.getRuns()) {
			
 
				-                        // 检查 run 中是否有分页符
			
 
				-                        CTR ctr = run.getCTR();
			
 
				-                        if (ctr != null) {
			
 
				-                            // 检查是否有硬分页符 (page break)
			
 
				-                            if (ctr.getBrList() != null) {
			
 
				-                                for (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr br : ctr.getBrList()) {
			
 
				-                                    if (br.getType() == org.openxmlformats.schemas.wordprocessingml.x2006.main.STBrType.PAGE) {
			
 
				-                                        sb.append(PAGE_BREAK);
			
 
				-                                        pageBreakCount++;
			
 
				-                                    }
			
 
				-                                }
			
 
				-                            }
			
 
				-                        }
			
 
				-                        
			
 
				-                        // 添加文本内容
			
 
				-                        String text = run.getText(0);
			
 
				-                        if (text != null) {
			
 
				-                            sb.append(text);
			
 
				-                        }
			
 
				-                    }
			
 
				-                    
			
 
				-                    // 检查段落后是否有分页符（通过段落属性）
			
 
				-                    if (paragraph.getCTP() != null && paragraph.getCTP().getPPr() != null) {
			
 
				-                        var pPr = paragraph.getCTP().getPPr();
			
 
				-                        // 检查分节符带来的分页
			
 
				-                        if (pPr.getSectPr() != null) {
			
 
				-                            var sectPr = pPr.getSectPr();
			
 
				-                            if (sectPr.getType() != null) {
			
 
				-                                String type = sectPr.getType().getVal().toString();
			
 
				-                                if ("nextPage".equals(type) || "oddPage".equals(type) || "evenPage".equals(type)) {
			
 
				-                                    sb.append(PAGE_BREAK);
			
 
				-                                    pageBreakCount++;
			
 
				-                                }
			
 
				-                            }
			
 
				-                        }
			
 
				-                    }
			
 
				-                    
			
 
				-                    sb.append("\n");
			
 
				-                    
			
 
				-                } else if (element instanceof XWPFTable) {
			
 
				-                    XWPFTable table = (XWPFTable) element;
			
 
				-                    for (XWPFTableRow row : table.getRows()) {
			
 
				-                        for (XWPFTableCell cell : row.getTableCells()) {
			
 
				-                            sb.append(cell.getText()).append("\t");
			
 
				-                        }
			
 
				-                        sb.append("\n");
			
 
				-                    }
			
 
				-                }
			
 
				-            }
			
 
				+             XWPFDocument document = new XWPFDocument(fis);
			
 
				+             XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
			
 
				             
			
 
				-            // 检查文档末尾的分节符
			
 
				-            if (document.getDocument().getBody().getSectPr() != null) {
			
 
				-                // 文档末尾的分节符不需要额外处理
			
 
				-            }
			
 
				+            String text = extractor.getText();
			
 
				+            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
			
 
				+            return text != null ? text : "";
			
 
				         }
			
 
				-        
			
 
				-        String result = sb.toString();
			
 
				-        log.info("提取完成: 文本长度={}, 分页符数量={}", result.length(), pageBreakCount);
			
 
				-        return result;
			
 
				     }
			
 
				     
			
 
				     /**
			
 
				      * 从.doc文件提取文本
			
 
				-     * .doc 格式的分页符通常会被 WordExtractor 保留为 \f
			
 
				      */
			
 
				     private String extractFromDoc(String filePath) throws IOException {
			
 
				         log.info("提取.doc文件文本: {}", filePath);
			
@@ -149,14 +78,7 @@ public class WordTextExtractionService {
 
				              WordExtractor extractor = new WordExtractor(document)) {
			
 
				             
			
 
				             String text = extractor.getText();
			
 
				-            int pageBreakCount = 0;
			
 
				-            if (text != null) {
			
 
				-                for (char c : text.toCharArray()) {
			
 
				-                    if (c == PAGE_BREAK) pageBreakCount++;
			
 
				-                }
			
 
				-            }
			
 
				-            log.info("提取完成: 文本长度={}, 分页符数量={}", 
			
 
				-                    text != null ? text.length() : 0, pageBreakCount);
			
 
				+            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
			
 
				             return text != null ? text : "";
			
 
				         }
			
 
				     }