Просмотр исходного кода

refactor: 移除页码定位功能,简化文档处理

根据设计原型,不需要页码定位功能:
- 删除 DocumentIndexService(生成页面/行索引)
- 删除 PositionMappingService(字符位置到页码映射)
- 简化 PdfTextExtractionService,移除索引生成
- 简化 WordTextExtractionService,移除分页符检测
- 简化 GraphNerService,直接使用原始字符偏移

实体位置信息仅保留 charStart/charEnd 字符偏移
何文松 1 месяц назад
Родитель
Коммит
fe3056fbb3

+ 3 - 69
backend/graph-service/src/main/java/com/lingyue/graph/service/GraphNerService.java

@@ -34,7 +34,6 @@ public class GraphNerService {
     private final TextStorageRepository textStorageRepository;
     private final GraphNodeRepository graphNodeRepository;
     private final GraphRelationRepository graphRelationRepository;
-    private final PositionMappingService positionMappingService;
 
     /**
      * 获取文档的文本内容
@@ -99,17 +98,13 @@ public class GraphNerService {
             node.setCreateTime(new Date());
             node.setUpdateTime(new Date());
             
-            // 转换位置信息并补充页码/行号
+            // 转换位置信息(直接使用字符偏移)
             Object positionObj = entity.get("position");
             if (positionObj instanceof Map) {
                 @SuppressWarnings("unchecked")
                 Map<String, Object> posMap = (Map<String, Object>) positionObj;
-                
-                // 使用 PositionMappingService 补充页码和行号
-                Map<String, Object> enrichedPosition = enrichPosition(documentId, posMap);
-                
-                log.debug("实体位置信息: name={}, position={}", node.getName(), enrichedPosition);
-                node.setPosition(enrichedPosition);
+                log.debug("实体位置信息: name={}, position={}", node.getName(), posMap);
+                node.setPosition(posMap);
             } else {
                 log.debug("实体无位置信息: name={}, positionObj={}", node.getName(), positionObj);
             }
@@ -294,67 +289,6 @@ public class GraphNerService {
         }
     }
 
-    /**
-     * 丰富位置信息,补充页码和行号
-     * 
-     * @param documentId 文档ID
-     * @param posMap 原始位置信息(包含 charStart, charEnd)
-     * @return 丰富后的位置信息(包含 charStart, charEnd, page, line)
-     */
-    private Map<String, Object> enrichPosition(String documentId, Map<String, Object> posMap) {
-        // 如果已经有页码和行号,直接返回
-        if (posMap.containsKey("page") && posMap.get("page") != null 
-            && posMap.containsKey("line") && posMap.get("line") != null) {
-            log.debug("位置信息已完整,跳过映射: documentId={}", documentId);
-            return posMap;
-        }
-        
-        // 获取字符位置
-        Integer charStart = getIntValue(posMap, "charStart");
-        Integer charEnd = getIntValue(posMap, "charEnd");
-        
-        if (charStart == null || charEnd == null) {
-            log.debug("缺少字符位置信息,跳过映射: documentId={}, posMap={}", documentId, posMap);
-            return posMap;
-        }
-        
-        try {
-            // 使用 PositionMappingService 映射页码和行号
-            log.debug("开始位置映射: documentId={}, charStart={}, charEnd={}", documentId, charStart, charEnd);
-            Map<String, Object> mappedPosition = positionMappingService.mapCharToPosition(
-                documentId, charStart, charEnd);
-            
-            // 合并原始位置信息和映射结果
-            Map<String, Object> enrichedPosition = new HashMap<>(posMap);
-            enrichedPosition.putAll(mappedPosition);
-            
-            log.debug("位置映射完成: documentId={}, result={}", documentId, enrichedPosition);
-            return enrichedPosition;
-        } catch (Exception e) {
-            log.warn("位置映射失败: documentId={}, charStart={}, charEnd={}, error={}", 
-                    documentId, charStart, charEnd, e.getMessage());
-            return posMap;
-        }
-    }
-    
-    /**
-     * 从 Map 中获取整数值
-     */
-    private Integer getIntValue(Map<String, Object> map, String key) {
-        Object value = map.get(key);
-        if (value == null) {
-            return null;
-        }
-        if (value instanceof Number) {
-            return ((Number) value).intValue();
-        }
-        try {
-            return Integer.parseInt(value.toString());
-        } catch (NumberFormatException e) {
-            return null;
-        }
-    }
-    
     /**
      * 从 Map 中获取字符串值
      */

+ 0 - 337
backend/graph-service/src/main/java/com/lingyue/graph/service/PositionMappingService.java

@@ -1,337 +0,0 @@
-package com.lingyue.graph.service;
-
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import lombok.Data;
-import lombok.extern.slf4j.Slf4j;
-import org.springframework.beans.factory.annotation.Value;
-import org.springframework.stereotype.Service;
-
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-
-/**
- * 位置映射服务
- * 根据文档索引将字符位置映射到页码和行号
- * 
- * @author lingyue
- * @since 2026-01-20
- */
-@Slf4j
-@Service
-public class PositionMappingService {
-    
-    private final ObjectMapper objectMapper;
-    
-    @Value("${file.storage.text-path:/data/lingyue/texts}")
-    private String textStoragePath;
-    
-    // 索引缓存,避免重复读取文件
-    private final Map<String, DocumentIndex> indexCache = new ConcurrentHashMap<>();
-    
-    public PositionMappingService(ObjectMapper objectMapper) {
-        this.objectMapper = objectMapper;
-    }
-    
-    /**
-     * 根据字符位置获取完整位置信息(包含页码和行号)
-     * 
-     * @param documentId 文档ID
-     * @param charStart 字符起始位置
-     * @param charEnd 字符结束位置
-     * @return 位置信息 Map(包含 charStart, charEnd, page, line)
-     */
-    public Map<String, Object> mapCharToPosition(String documentId, int charStart, int charEnd) {
-        Map<String, Object> position = new HashMap<>();
-        position.put("charStart", charStart);
-        position.put("charEnd", charEnd);
-        
-        // 尝试加载文档索引
-        DocumentIndex index = loadDocumentIndex(documentId);
-        if (index == null) {
-            log.debug("未找到文档索引,返回仅包含字符位置的信息: documentId={}", documentId);
-            return position;
-        }
-        
-        // 查找页码
-        int page = findPage(index, charStart);
-        position.put("page", page);
-        
-        // 查找行号
-        int line = findLine(index, charStart);
-        position.put("line", line);
-        
-        // 计算全局行号(如果有行索引)
-        if (index.getLines() != null && index.getLines().length > 0) {
-            position.put("globalLine", line);
-        }
-        
-        return position;
-    }
-    
-    /**
-     * 根据页码和行号获取字符位置范围
-     * 
-     * @param documentId 文档ID
-     * @param page 页码
-     * @param line 行号(页内行号)
-     * @return 字符位置范围 [charStart, charEnd] 或 null
-     */
-    public int[] mapPageLineToChar(String documentId, int page, int line) {
-        DocumentIndex index = loadDocumentIndex(documentId);
-        if (index == null || index.getPages() == null) {
-            return null;
-        }
-        
-        // 找到对应页
-        PageIndex pageIndex = null;
-        for (PageIndex p : index.getPages()) {
-            if (p.getPage() == page) {
-                pageIndex = p;
-                break;
-            }
-        }
-        
-        if (pageIndex == null) {
-            return null;
-        }
-        
-        // 如果有行索引,找到具体行
-        if (index.getLines() != null) {
-            // 计算目标全局行号
-            int targetGlobalLine = pageIndex.getLineStart() + line - 1;
-            for (LineIndex lineIndex : index.getLines()) {
-                if (lineIndex.getLine() == targetGlobalLine) {
-                    return new int[]{lineIndex.getCharStart(), lineIndex.getCharEnd()};
-                }
-            }
-        }
-        
-        // 没有行索引,返回页的范围
-        return new int[]{pageIndex.getCharStart(), pageIndex.getCharEnd()};
-    }
-    
-    /**
-     * 批量映射位置信息
-     * 用于一次性处理多个实体的位置
-     * 
-     * @param documentId 文档ID
-     * @param charPositions 字符位置列表 [[charStart1, charEnd1], [charStart2, charEnd2], ...]
-     * @return 完整位置信息列表
-     */
-    public Map<String, Object>[] mapCharToPositionBatch(String documentId, int[][] charPositions) {
-        // 预加载索引
-        DocumentIndex index = loadDocumentIndex(documentId);
-        
-        @SuppressWarnings("unchecked")
-        Map<String, Object>[] results = new Map[charPositions.length];
-        
-        for (int i = 0; i < charPositions.length; i++) {
-            int charStart = charPositions[i][0];
-            int charEnd = charPositions[i][1];
-            
-            Map<String, Object> position = new HashMap<>();
-            position.put("charStart", charStart);
-            position.put("charEnd", charEnd);
-            
-            if (index != null) {
-                position.put("page", findPage(index, charStart));
-                position.put("line", findLine(index, charStart));
-            }
-            
-            results[i] = position;
-        }
-        
-        return results;
-    }
-    
-    /**
-     * 清除缓存的索引
-     */
-    public void clearCache(String documentId) {
-        indexCache.remove(documentId);
-    }
-    
-    /**
-     * 清除所有缓存
-     */
-    public void clearAllCache() {
-        indexCache.clear();
-    }
-    
-    /**
-     * 加载文档索引
-     */
-    private DocumentIndex loadDocumentIndex(String documentId) {
-        // 先检查缓存
-        if (indexCache.containsKey(documentId)) {
-            log.debug("从缓存加载索引: documentId={}", documentId);
-            return indexCache.get(documentId);
-        }
-        
-        // 构建索引文件路径
-        String indexFilePath = buildIndexFilePath(documentId);
-        Path path = Path.of(indexFilePath);
-        
-        log.debug("尝试加载索引文件: {}", indexFilePath);
-        
-        if (!Files.exists(path)) {
-            log.info("索引文件不存在,无法补充页码信息: {}", indexFilePath);
-            return null;
-        }
-        
-        try {
-            String json = Files.readString(path);
-            JsonNode root = objectMapper.readTree(json);
-            
-            DocumentIndex index = new DocumentIndex();
-            index.setDocumentId(root.path("documentId").asText(documentId));
-            index.setTotalChars(root.path("totalChars").asInt(0));
-            index.setTotalLines(root.path("totalLines").asInt(0));
-            index.setTotalPages(root.path("totalPages").asInt(0));
-            
-            // 解析页面索引
-            JsonNode pagesNode = root.path("pages");
-            if (pagesNode.isArray()) {
-                PageIndex[] pages = new PageIndex[pagesNode.size()];
-                for (int i = 0; i < pagesNode.size(); i++) {
-                    JsonNode pageNode = pagesNode.get(i);
-                    PageIndex pageIndex = new PageIndex();
-                    pageIndex.setPage(pageNode.path("page").asInt(i + 1));
-                    pageIndex.setCharStart(pageNode.path("charStart").asInt(0));
-                    pageIndex.setCharEnd(pageNode.path("charEnd").asInt(0));
-                    pageIndex.setLineStart(pageNode.path("lineStart").asInt(1));
-                    pageIndex.setLineEnd(pageNode.path("lineEnd").asInt(1));
-                    pages[i] = pageIndex;
-                }
-                index.setPages(pages);
-            }
-            
-            // 解析行索引
-            JsonNode linesNode = root.path("lines");
-            if (linesNode.isArray()) {
-                LineIndex[] lines = new LineIndex[linesNode.size()];
-                for (int i = 0; i < linesNode.size(); i++) {
-                    JsonNode lineNode = linesNode.get(i);
-                    LineIndex lineIndex = new LineIndex();
-                    lineIndex.setLine(lineNode.path("line").asInt(i + 1));
-                    lineIndex.setCharStart(lineNode.path("charStart").asInt(0));
-                    lineIndex.setCharEnd(lineNode.path("charEnd").asInt(0));
-                    lines[i] = lineIndex;
-                }
-                index.setLines(lines);
-            }
-            
-            // 缓存
-            indexCache.put(documentId, index);
-            log.debug("已加载并缓存文档索引: documentId={}, pages={}, lines={}", 
-                    documentId, 
-                    index.getPages() != null ? index.getPages().length : 0,
-                    index.getLines() != null ? index.getLines().length : 0);
-            
-            return index;
-        } catch (Exception e) {
-            log.error("加载文档索引失败: documentId={}", documentId, e);
-            return null;
-        }
-    }
-    
-    /**
-     * 构建索引文件路径
-     */
-    private String buildIndexFilePath(String documentId) {
-        return Path.of(
-                textStoragePath,
-                documentId.substring(0, 2),
-                documentId + "_index.json"
-        ).toString();
-    }
-    
-    /**
-     * 使用二分查找页码
-     */
-    private int findPage(DocumentIndex index, int charPosition) {
-        if (index.getPages() == null || index.getPages().length == 0) {
-            return 1;
-        }
-        
-        for (PageIndex page : index.getPages()) {
-            if (charPosition >= page.getCharStart() && charPosition <= page.getCharEnd()) {
-                return page.getPage();
-            }
-        }
-        
-        // 如果未找到,返回最后一页
-        return index.getPages()[index.getPages().length - 1].getPage();
-    }
-    
-    /**
-     * 使用二分查找行号
-     */
-    private int findLine(DocumentIndex index, int charPosition) {
-        if (index.getLines() == null || index.getLines().length == 0) {
-            return 1;
-        }
-        
-        // 二分查找
-        int left = 0;
-        int right = index.getLines().length - 1;
-        
-        while (left <= right) {
-            int mid = (left + right) / 2;
-            LineIndex line = index.getLines()[mid];
-            
-            if (charPosition < line.getCharStart()) {
-                right = mid - 1;
-            } else if (charPosition > line.getCharEnd()) {
-                left = mid + 1;
-            } else {
-                return line.getLine();
-            }
-        }
-        
-        // 如果未找到,返回最近的行
-        if (left >= index.getLines().length) {
-            return index.getLines()[index.getLines().length - 1].getLine();
-        }
-        return index.getLines()[left].getLine();
-    }
-    
-    /**
-     * 文档索引
-     */
-    @Data
-    public static class DocumentIndex {
-        private String documentId;
-        private PageIndex[] pages;
-        private LineIndex[] lines;
-        private int totalChars;
-        private int totalLines;
-        private int totalPages;
-    }
-    
-    /**
-     * 页面索引
-     */
-    @Data
-    public static class PageIndex {
-        private int page;
-        private int charStart;
-        private int charEnd;
-        private int lineStart;
-        private int lineEnd;
-    }
-    
-    /**
-     * 行索引
-     */
-    @Data
-    public static class LineIndex {
-        private int line;
-        private int charStart;
-        private int charEnd;
-    }
-}

+ 0 - 313
backend/parse-service/src/main/java/com/lingyue/parse/service/DocumentIndexService.java

@@ -1,313 +0,0 @@
-package com.lingyue.parse.service;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-import lombok.Data;
-import lombok.RequiredArgsConstructor;
-import lombok.extern.slf4j.Slf4j;
-import org.springframework.stereotype.Service;
-
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * 文档索引服务
- * 负责生成和管理文档的位置索引
- * 
- * @author lingyue
- * @since 2026-01-20
- */
-@Slf4j
-@Service
-@RequiredArgsConstructor
-public class DocumentIndexService {
-    
-    private final ObjectMapper objectMapper;
-    
-    /**
-     * 分页符字符(Form Feed)
-     */
-    private static final char FORM_FEED = '\f';
-    
-    /**
-     * 为纯文本生成行索引(Word/Excel等文档)
-     * 如果文本中包含分页符(\f),则根据分页符生成页面索引
-     * 
-     * @param text 文本内容
-     * @param documentId 文档ID
-     * @param indexOutputPath 索引输出路径
-     * @return 文档索引
-     */
-    public DocumentIndex generateLineIndex(String text, String documentId, String indexOutputPath) {
-        if (text == null || text.isEmpty()) {
-            return createEmptyIndex(documentId);
-        }
-        
-        // 检查是否包含分页符
-        boolean hasPageBreaks = text.indexOf(FORM_FEED) >= 0;
-        
-        if (hasPageBreaks) {
-            log.info("检测到分页符,生成分页索引: documentId={}", documentId);
-            return generateIndexWithPageBreaks(text, documentId, indexOutputPath);
-        } else {
-            log.debug("无分页符,生成单页索引: documentId={}", documentId);
-            return generateSinglePageIndex(text, documentId, indexOutputPath);
-        }
-    }
-    
-    /**
-     * 根据分页符生成多页索引
-     */
-    private DocumentIndex generateIndexWithPageBreaks(String text, String documentId, String indexOutputPath) {
-        List<PageIndex> pageIndices = new ArrayList<>();
-        List<LineIndex> lineIndices = new ArrayList<>();
-        
-        int charPos = 0;
-        int lineNum = 1;
-        int pageNum = 1;
-        int pageCharStart = 0;
-        int pageLineStart = 1;
-        
-        String[] lines = text.split("\n", -1);
-        
-        for (String line : lines) {
-            // 检查这一行是否包含分页符
-            int ffIndex = line.indexOf(FORM_FEED);
-            
-            if (ffIndex >= 0) {
-                // 处理分页符之前的内容
-                if (ffIndex > 0) {
-                    LineIndex lineIndex = new LineIndex();
-                    lineIndex.setLine(lineNum);
-                    lineIndex.setCharStart(charPos);
-                    lineIndex.setCharEnd(charPos + ffIndex);
-                    lineIndices.add(lineIndex);
-                }
-                
-                // 结束当前页
-                PageIndex pageIndex = new PageIndex();
-                pageIndex.setPage(pageNum);
-                pageIndex.setCharStart(pageCharStart);
-                pageIndex.setCharEnd(charPos + ffIndex);
-                pageIndex.setLineStart(pageLineStart);
-                pageIndex.setLineEnd(lineNum);
-                pageIndices.add(pageIndex);
-                
-                // 开始新页
-                pageNum++;
-                pageCharStart = charPos + ffIndex + 1; // +1 跳过分页符
-                pageLineStart = lineNum + 1;
-                
-                // 处理分页符之后的内容(如果有)
-                if (ffIndex + 1 < line.length()) {
-                    lineNum++;
-                    LineIndex afterLineIndex = new LineIndex();
-                    afterLineIndex.setLine(lineNum);
-                    afterLineIndex.setCharStart(charPos + ffIndex + 1);
-                    afterLineIndex.setCharEnd(charPos + line.length());
-                    lineIndices.add(afterLineIndex);
-                }
-            } else {
-                // 普通行
-                LineIndex lineIndex = new LineIndex();
-                lineIndex.setLine(lineNum);
-                lineIndex.setCharStart(charPos);
-                lineIndex.setCharEnd(charPos + line.length());
-                lineIndices.add(lineIndex);
-            }
-            
-            charPos += line.length() + 1; // +1 for \n
-            lineNum++;
-        }
-        
-        // 添加最后一页
-        if (pageCharStart < text.length()) {
-            PageIndex lastPage = new PageIndex();
-            lastPage.setPage(pageNum);
-            lastPage.setCharStart(pageCharStart);
-            lastPage.setCharEnd(text.length());
-            lastPage.setLineStart(pageLineStart);
-            lastPage.setLineEnd(lineNum - 1);
-            pageIndices.add(lastPage);
-        }
-        
-        // 创建文档索引
-        DocumentIndex index = new DocumentIndex();
-        index.setDocumentId(documentId);
-        index.setTotalChars(text.length());
-        index.setTotalLines(lineNum - 1);
-        index.setTotalPages(pageIndices.size());
-        index.setPages(pageIndices);
-        index.setLines(lineIndices);
-        
-        log.info("分页索引生成完成: documentId={}, pages={}, lines={}", 
-                documentId, pageIndices.size(), lineIndices.size());
-        
-        // 保存索引文件
-        if (indexOutputPath != null) {
-            saveIndexFile(index, indexOutputPath);
-        }
-        
-        return index;
-    }
-    
-    /**
-     * 生成单页索引(无分页符的文档)
-     */
-    private DocumentIndex generateSinglePageIndex(String text, String documentId, String indexOutputPath) {
-        List<LineIndex> lineIndices = new ArrayList<>();
-        int charPos = 0;
-        int lineNum = 1;
-        
-        String[] lines = text.split("\n", -1);
-        for (String line : lines) {
-            LineIndex lineIndex = new LineIndex();
-            lineIndex.setLine(lineNum);
-            lineIndex.setCharStart(charPos);
-            lineIndex.setCharEnd(charPos + line.length());
-            lineIndices.add(lineIndex);
-            
-            charPos += line.length() + 1; // +1 for \n
-            lineNum++;
-        }
-        
-        // 创建文档索引(无分页,只有行索引)
-        DocumentIndex index = new DocumentIndex();
-        index.setDocumentId(documentId);
-        index.setTotalChars(text.length());
-        index.setTotalLines(lines.length);
-        index.setTotalPages(1); // 非分页文档统一为1页
-        index.setLines(lineIndices);
-        
-        // 创建虚拟的单页索引
-        List<PageIndex> pages = new ArrayList<>();
-        PageIndex singlePage = new PageIndex();
-        singlePage.setPage(1);
-        singlePage.setCharStart(0);
-        singlePage.setCharEnd(text.length());
-        singlePage.setLineStart(1);
-        singlePage.setLineEnd(lines.length);
-        pages.add(singlePage);
-        index.setPages(pages);
-        
-        // 保存索引文件
-        if (indexOutputPath != null) {
-            saveIndexFile(index, indexOutputPath);
-        }
-        
-        return index;
-    }
-    
-    /**
-     * 读取索引文件
-     */
-    public DocumentIndex loadIndex(String indexFilePath) {
-        try {
-            Path path = Paths.get(indexFilePath);
-            if (!Files.exists(path)) {
-                log.warn("索引文件不存在: {}", indexFilePath);
-                return null;
-            }
-            String json = Files.readString(path);
-            return objectMapper.readValue(json, DocumentIndex.class);
-        } catch (Exception e) {
-            log.error("读取索引文件失败: {}", indexFilePath, e);
-            return null;
-        }
-    }
-    
-    /**
-     * 根据字符位置查找页码和行号
-     * 
-     * @param index 文档索引
-     * @param charPosition 字符位置
-     * @return [页码, 行号] 或 null(未找到)
-     */
-    public int[] findPageAndLine(DocumentIndex index, int charPosition) {
-        if (index == null || index.getPages() == null) {
-            return null;
-        }
-        
-        // 1. 找到所在页
-        int page = 1;
-        for (PageIndex pageIndex : index.getPages()) {
-            if (charPosition >= pageIndex.getCharStart() && charPosition <= pageIndex.getCharEnd()) {
-                page = pageIndex.getPage();
-                break;
-            }
-        }
-        
-        // 2. 找到所在行
-        int line = 1;
-        if (index.getLines() != null) {
-            for (LineIndex lineIndex : index.getLines()) {
-                if (charPosition >= lineIndex.getCharStart() && charPosition <= lineIndex.getCharEnd()) {
-                    line = lineIndex.getLine();
-                    break;
-                }
-            }
-        }
-        
-        return new int[]{page, line};
-    }
-    
-    private DocumentIndex createEmptyIndex(String documentId) {
-        DocumentIndex index = new DocumentIndex();
-        index.setDocumentId(documentId);
-        index.setTotalChars(0);
-        index.setTotalLines(0);
-        index.setTotalPages(0);
-        index.setPages(new ArrayList<>());
-        return index;
-    }
-    
-    private void saveIndexFile(DocumentIndex index, String outputPath) {
-        try {
-            Path path = Paths.get(outputPath);
-            Files.createDirectories(path.getParent());
-            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
-            Files.writeString(path, json);
-            log.info("文档索引文件已保存: {}", outputPath);
-        } catch (Exception e) {
-            log.error("保存文档索引文件失败: {}", outputPath, e);
-        }
-    }
-    
-    /**
-     * 文档索引
-     */
-    @Data
-    public static class DocumentIndex {
-        private String documentId;
-        private List<PageIndex> pages;
-        private List<LineIndex> lines;
-        private int totalChars;
-        private int totalLines;
-        private int totalPages;
-    }
-    
-    /**
-     * 页面索引
-     */
-    @Data
-    public static class PageIndex {
-        private int page;
-        private int charStart;
-        private int charEnd;
-        private int lineStart;
-        private int lineEnd;
-        private boolean ocrUsed;
-    }
-    
-    /**
-     * 行索引
-     */
-    @Data
-    public static class LineIndex {
-        private int line;
-        private int charStart;
-        private int charEnd;
-    }
-}

+ 5 - 29
backend/parse-service/src/main/java/com/lingyue/parse/service/ParseService.java

@@ -32,7 +32,6 @@ public class ParseService {
     private final ExcelTextExtractionService excelTextExtractionService;
     private final OcrResultParser ocrResultParser;
     private final LayoutAnalysisService layoutAnalysisService;
-    private final DocumentIndexService documentIndexService;
     private final FileStorageProperties fileStorageProperties;
     // 单体应用直接注入 Service,不使用 Feign Client
     private final com.lingyue.graph.service.TextStorageService textStorageService;
@@ -97,12 +96,9 @@ public class ParseService {
                 task.setProgress(20);
                 saveParseTask(task);
                 
-                // PDF使用分页判断逻辑,并生成页面索引
-                String indexFilePath = buildIndexFilePath(documentId);
-                PdfTextExtractionService.ExtractionResult extractionResult = 
-                    pdfTextExtractionService.extractTextWithIndex(sourceFilePath, documentId, indexFilePath);
-                plainText = extractionResult.getText();
-                log.info("PDF提取完成,索引文件: {}", indexFilePath);
+                // PDF使用分页判断逻辑
+                plainText = pdfTextExtractionService.extractText(sourceFilePath);
+                log.info("PDF提取完成,文本长度: {}", plainText.length());
             } else if (fileType == FileType.WORD || fileType == FileType.WORD_OLD) {
                 log.info("处理Word文件: {}", sourceFilePath);
                 task.setCurrentStep("word_extraction");
@@ -111,11 +107,7 @@ public class ParseService {
                 
                 // Word文档直接提取文本
                 plainText = wordTextExtractionService.extractText(sourceFilePath);
-                
-                // 为Word生成行索引
-                String indexFilePath = buildIndexFilePath(documentId);
-                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
-                log.info("Word提取完成,索引文件: {}", indexFilePath);
+                log.info("Word提取完成,文本长度: {}", plainText.length());
             } else if (fileType == FileType.EXCEL || fileType == FileType.EXCEL_OLD) {
                 log.info("处理Excel文件: {}", sourceFilePath);
                 task.setCurrentStep("excel_extraction");
@@ -124,11 +116,7 @@ public class ParseService {
                 
                 // Excel表格直接提取文本
                 plainText = excelTextExtractionService.extractText(sourceFilePath);
-                
-                // 为Excel生成行索引
-                String indexFilePath = buildIndexFilePath(documentId);
-                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
-                log.info("Excel提取完成,索引文件: {}", indexFilePath);
+                log.info("Excel提取完成,文本长度: {}", plainText.length());
             } else if (fileType.isImage()) {
                 log.info("处理图片文件: {}", sourceFilePath);
                 task.setCurrentStep("ocr");
@@ -283,18 +271,6 @@ public class ParseService {
         return path.toString();
     }
     
-    /**
-     * 根据文档ID构建索引文件存储路径
-     */
-    private String buildIndexFilePath(String documentId) {
-        Path path = Path.of(
-                fileStorageProperties.getTextPath(),
-                documentId.substring(0, 2),
-                documentId + "_index.json"
-        );
-        return path.toString();
-    }
-
     /**
      * 将纯文本写入 TXT 文件
      * 对于大文件使用分块写入,避免内存溢出

+ 0 - 213
backend/parse-service/src/main/java/com/lingyue/parse/service/PdfTextExtractionService.java

@@ -1,8 +1,6 @@
 package com.lingyue.parse.service;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
 import com.lingyue.common.exception.ServiceException;
-import lombok.Data;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.pdfbox.pdmodel.PDDocument;
@@ -16,7 +14,6 @@ import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.UUID;
@@ -35,7 +32,6 @@ public class PdfTextExtractionService {
     
     private final PaddleOcrClient paddleOcrClient;
     private final OcrResultParser ocrResultParser;
-    private final ObjectMapper objectMapper;
     
     /**
      * 文本阈值:每页至少需要这么多字符才认为有文本层
@@ -209,170 +205,6 @@ public class PdfTextExtractionService {
         return combinedText.toString();
     }
     
-    /**
-     * 提取PDF文本并生成页面索引
-     * 
-     * @param pdfFilePath PDF文件路径
-     * @param documentId 文档ID
-     * @param indexOutputPath 索引文件输出路径(如果为null则不生成索引文件)
-     * @return 提取结果(包含文本和索引)
-     */
-    public ExtractionResult extractTextWithIndex(String pdfFilePath, String documentId, String indexOutputPath) {
-        File pdfFile = new File(pdfFilePath);
-        if (!pdfFile.exists()) {
-            throw new ServiceException("PDF文件不存在: " + pdfFilePath);
-        }
-        
-        List<PageTextResult> pageResults = new ArrayList<>();
-        
-        try (PDDocument document = org.apache.pdfbox.Loader.loadPDF(pdfFile)) {
-            int totalPages = document.getNumberOfPages();
-            log.info("开始处理PDF文件(带索引): {}, 总页数: {}", pdfFilePath, totalPages);
-            
-            PDFTextStripper textStripper = new PDFTextStripper();
-            
-            // 逐页处理
-            for (int pageNum = 1; pageNum <= totalPages; pageNum++) {
-                log.debug("处理第 {} 页/共 {} 页", pageNum, totalPages);
-                
-                try {
-                    textStripper.setStartPage(pageNum);
-                    textStripper.setEndPage(pageNum);
-                    String pageText = textStripper.getText(document);
-                    
-                    if (hasSufficientText(pageText)) {
-                        log.debug("第 {} 页有文本层,直接使用,文本长度: {}", pageNum, pageText.length());
-                        pageResults.add(new PageTextResult(pageNum, pageText, false));
-                    } else {
-                        log.debug("第 {} 页文本不足,使用OCR处理", pageNum);
-                        String ocrText = extractTextByOcr(pdfFilePath, pageNum);
-                        pageResults.add(new PageTextResult(pageNum, ocrText, true));
-                    }
-                } catch (Exception e) {
-                    log.error("处理第 {} 页时出错,尝试使用OCR", pageNum, e);
-                    try {
-                        String ocrText = extractTextByOcr(pdfFilePath, pageNum);
-                        pageResults.add(new PageTextResult(pageNum, ocrText, true));
-                    } catch (Exception ocrException) {
-                        log.error("第 {} 页OCR也失败", pageNum, ocrException);
-                        pageResults.add(new PageTextResult(pageNum, "", true));
-                    }
-                }
-            }
-        } catch (IOException e) {
-            log.error("读取PDF文件失败: {}", pdfFilePath, e);
-            throw new ServiceException("读取PDF文件失败: " + e.getMessage());
-        }
-        
-        // 生成带索引的合并结果
-        return combinePageTextsWithIndex(pageResults, documentId, indexOutputPath);
-    }
-    
-    /**
-     * 合并页面文本并生成索引
-     */
-    private ExtractionResult combinePageTextsWithIndex(List<PageTextResult> pageResults, 
-                                                        String documentId, 
-                                                        String indexOutputPath) {
-        StringBuilder combinedText = new StringBuilder();
-        List<PageIndex> pageIndices = new ArrayList<>();
-        List<LineIndex> lineIndices = new ArrayList<>();
-        
-        int currentCharPos = 0;
-        int currentLine = 1;
-        
-        for (PageTextResult result : pageResults) {
-            int pageCharStart = currentCharPos;
-            int pageLineStart = currentLine;
-            
-            if (result.getText() != null && !result.getText().trim().isEmpty()) {
-                // 添加页头标记
-                String pageHeader = "=== 第 " + result.getPageNum() + " 页";
-                if (result.isOcrUsed()) {
-                    pageHeader += " (OCR识别)";
-                } else {
-                    pageHeader += " (文本层提取)";
-                }
-                pageHeader += " ===\n";
-                
-                // 记录页头行的索引
-                LineIndex headerLineIndex = new LineIndex();
-                headerLineIndex.setLine(currentLine);
-                headerLineIndex.setCharStart(currentCharPos);
-                headerLineIndex.setCharEnd(currentCharPos + pageHeader.length() - 2); // -2 去掉 \n
-                lineIndices.add(headerLineIndex);
-                
-                combinedText.append(pageHeader);
-                currentCharPos += pageHeader.length();
-                currentLine++;
-                
-                // 添加页面内容并记录每行索引
-                String pageText = result.getText();
-                String[] lines = pageText.split("\n", -1);
-                for (String line : lines) {
-                    LineIndex lineIndex = new LineIndex();
-                    lineIndex.setLine(currentLine);
-                    lineIndex.setCharStart(currentCharPos);
-                    lineIndex.setCharEnd(currentCharPos + line.length());
-                    lineIndices.add(lineIndex);
-                    
-                    currentCharPos += line.length() + 1; // +1 for \n
-                    currentLine++;
-                }
-                
-                // 添加额外的空行分隔符
-                combinedText.append(pageText).append("\n\n");
-                currentCharPos++; // 额外的 \n
-                currentLine++;
-            }
-            
-            // 创建页面索引
-            PageIndex pageIndex = new PageIndex();
-            pageIndex.setPage(result.getPageNum());
-            pageIndex.setCharStart(pageCharStart);
-            pageIndex.setCharEnd(currentCharPos - 1);
-            pageIndex.setLineStart(pageLineStart);
-            pageIndex.setLineEnd(currentLine - 1);
-            pageIndex.setOcrUsed(result.isOcrUsed());
-            pageIndices.add(pageIndex);
-        }
-        
-        // 创建文档索引
-        DocumentIndex documentIndex = new DocumentIndex();
-        documentIndex.setDocumentId(documentId);
-        documentIndex.setPages(pageIndices);
-        documentIndex.setLines(lineIndices);
-        documentIndex.setTotalChars(currentCharPos);
-        documentIndex.setTotalLines(currentLine - 1);
-        documentIndex.setTotalPages(pageResults.size());
-        
-        // 保存索引文件
-        if (indexOutputPath != null) {
-            saveIndexFile(documentIndex, indexOutputPath);
-        }
-        
-        ExtractionResult result = new ExtractionResult();
-        result.setText(combinedText.toString());
-        result.setIndex(documentIndex);
-        return result;
-    }
-    
-    /**
-     * 保存索引文件
-     */
-    private void saveIndexFile(DocumentIndex index, String outputPath) {
-        try {
-            Path path = Paths.get(outputPath);
-            Files.createDirectories(path.getParent());
-            String json = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(index);
-            Files.writeString(path, json);
-            log.info("页面索引文件已保存: {}", outputPath);
-        } catch (Exception e) {
-            log.error("保存页面索引文件失败: {}", outputPath, e);
-            // 索引保存失败不影响主流程
-        }
-    }
-    
     /**
      * 页面文本结果
      */
@@ -399,49 +231,4 @@ public class PdfTextExtractionService {
             return ocrUsed;
         }
     }
-    
-    /**
-     * 提取结果(包含文本和索引)
-     */
-    @Data
-    public static class ExtractionResult {
-        private String text;
-        private DocumentIndex index;
-    }
-    
-    /**
-     * 文档索引
-     */
-    @Data
-    public static class DocumentIndex {
-        private String documentId;
-        private List<PageIndex> pages;
-        private List<LineIndex> lines;
-        private int totalChars;
-        private int totalLines;
-        private int totalPages;
-    }
-    
-    /**
-     * 行索引
-     */
-    @Data
-    public static class LineIndex {
-        private int line;
-        private int charStart;
-        private int charEnd;
-    }
-    
-    /**
-     * 页面索引
-     */
-    @Data
-    public static class PageIndex {
-        private int page;
-        private int charStart;
-        private int charEnd;
-        private int lineStart;
-        private int lineEnd;
-        private boolean ocrUsed;
-    }
 }

+ 13 - 91
backend/parse-service/src/main/java/com/lingyue/parse/service/WordTextExtractionService.java

@@ -4,8 +4,8 @@ import com.lingyue.common.exception.ServiceException;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.extractor.WordExtractor;
-import org.apache.poi.xwpf.usermodel.*;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.springframework.stereotype.Service;
 
 import java.io.File;
@@ -15,7 +15,6 @@ import java.io.IOException;
 /**
  * Word文档文本提取服务
  * 支持.docx和.doc格式
- * 在分页符位置插入 \f 字符,便于后续识别页码
  * 
  * @author lingyue
  * @since 2026-01-14
@@ -24,16 +23,11 @@ import java.io.IOException;
 @Service
 public class WordTextExtractionService {
     
-    /**
-     * 分页符字符
-     */
-    private static final char PAGE_BREAK = '\f';
-    
     /**
      * 提取Word文档文本
      * 
      * @param wordFilePath Word文件路径
-     * @return 提取的文本内容(分页符位置插入 \f)
+     * @return 提取的文本内容
      */
     public String extractText(String wordFilePath) {
         File wordFile = new File(wordFilePath);
@@ -45,7 +39,7 @@ public class WordTextExtractionService {
         
         try {
             if (fileName.endsWith(".docx")) {
-                return extractFromDocxWithPageBreaks(wordFilePath);
+                return extractFromDocx(wordFilePath);
             } else if (fileName.endsWith(".doc")) {
                 return extractFromDoc(wordFilePath);
             } else {
@@ -58,88 +52,23 @@ public class WordTextExtractionService {
     }
     
     /**
-     * 从.docx文件提取文本,保留分页符
+     * 从.docx文件提取文本
      */
-    private String extractFromDocxWithPageBreaks(String filePath) throws IOException {
-        log.info("提取.docx文件文本(含分页符): {}", filePath);
-        
-        StringBuilder sb = new StringBuilder();
-        int pageBreakCount = 0;
+    private String extractFromDocx(String filePath) throws IOException {
+        log.info("提取.docx文件文本: {}", filePath);
         
         try (FileInputStream fis = new FileInputStream(filePath);
-             XWPFDocument document = new XWPFDocument(fis)) {
-            
-            // 遍历文档主体的所有元素
-            for (IBodyElement element : document.getBodyElements()) {
-                if (element instanceof XWPFParagraph) {
-                    XWPFParagraph paragraph = (XWPFParagraph) element;
-                    
-                    // 检查段落中的分页符
-                    for (XWPFRun run : paragraph.getRuns()) {
-                        // 检查 run 中是否有分页符
-                        CTR ctr = run.getCTR();
-                        if (ctr != null) {
-                            // 检查是否有硬分页符 (page break)
-                            if (ctr.getBrList() != null) {
-                                for (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr br : ctr.getBrList()) {
-                                    if (br.getType() == org.openxmlformats.schemas.wordprocessingml.x2006.main.STBrType.PAGE) {
-                                        sb.append(PAGE_BREAK);
-                                        pageBreakCount++;
-                                    }
-                                }
-                            }
-                        }
-                        
-                        // 添加文本内容
-                        String text = run.getText(0);
-                        if (text != null) {
-                            sb.append(text);
-                        }
-                    }
-                    
-                    // 检查段落后是否有分页符(通过段落属性)
-                    if (paragraph.getCTP() != null && paragraph.getCTP().getPPr() != null) {
-                        var pPr = paragraph.getCTP().getPPr();
-                        // 检查分节符带来的分页
-                        if (pPr.getSectPr() != null) {
-                            var sectPr = pPr.getSectPr();
-                            if (sectPr.getType() != null) {
-                                String type = sectPr.getType().getVal().toString();
-                                if ("nextPage".equals(type) || "oddPage".equals(type) || "evenPage".equals(type)) {
-                                    sb.append(PAGE_BREAK);
-                                    pageBreakCount++;
-                                }
-                            }
-                        }
-                    }
-                    
-                    sb.append("\n");
-                    
-                } else if (element instanceof XWPFTable) {
-                    XWPFTable table = (XWPFTable) element;
-                    for (XWPFTableRow row : table.getRows()) {
-                        for (XWPFTableCell cell : row.getTableCells()) {
-                            sb.append(cell.getText()).append("\t");
-                        }
-                        sb.append("\n");
-                    }
-                }
-            }
+             XWPFDocument document = new XWPFDocument(fis);
+             XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
             
-            // 检查文档末尾的分节符
-            if (document.getDocument().getBody().getSectPr() != null) {
-                // 文档末尾的分节符不需要额外处理
-            }
+            String text = extractor.getText();
+            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
+            return text != null ? text : "";
         }
-        
-        String result = sb.toString();
-        log.info("提取完成: 文本长度={}, 分页符数量={}", result.length(), pageBreakCount);
-        return result;
     }
     
     /**
      * 从.doc文件提取文本
-     * .doc 格式的分页符通常会被 WordExtractor 保留为 \f
      */
     private String extractFromDoc(String filePath) throws IOException {
         log.info("提取.doc文件文本: {}", filePath);
@@ -149,14 +78,7 @@ public class WordTextExtractionService {
              WordExtractor extractor = new WordExtractor(document)) {
             
             String text = extractor.getText();
-            int pageBreakCount = 0;
-            if (text != null) {
-                for (char c : text.toCharArray()) {
-                    if (c == PAGE_BREAK) pageBreakCount++;
-                }
-            }
-            log.info("提取完成: 文本长度={}, 分页符数量={}", 
-                    text != null ? text.length() : 0, pageBreakCount);
+            log.debug("提取到文本长度: {}", text != null ? text.length() : 0);
             return text != null ? text : "";
         }
     }