// Repository: hewensong/lingyue-zhibao
							package com.lingyue.parse.service;

import com.lingyue.parse.config.FileStorageProperties;
import com.lingyue.parse.entity.ParseTask;
import com.lingyue.parse.enums.FileType;
import com.lingyue.parse.repository.ParseTaskRepository;
import com.lingyue.parse.util.ErrorCategory;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

/**
 * 解析服务
 *
 * 负责管理解析任务、调用 OCR 服务以及将解析后的文本写入 TXT 文件。
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class ParseService {

    private final ParseTaskRepository parseTaskRepository;
    private final PaddleOcrClient paddleOcrClient;
    private final PdfTextExtractionService pdfTextExtractionService;
    private final WordTextExtractionService wordTextExtractionService;
    private final ExcelTextExtractionService excelTextExtractionService;
    private final OcrResultParser ocrResultParser;
    private final LayoutAnalysisService layoutAnalysisService;
    private final DocumentIndexService documentIndexService;
    private final FileStorageProperties fileStorageProperties;
    // 单体应用直接注入 Service，不使用 Feign Client
    private final com.lingyue.graph.service.TextStorageService textStorageService;

    /**
     * 根据ID获取解析任务
     */
    public ParseTask getParseTaskById(String taskId) {
        return parseTaskRepository.selectById(taskId);
    }

    /**
     * 根据文档ID获取解析任务
     */
    public ParseTask getParseTaskByDocumentId(String documentId) {
        return parseTaskRepository.findByDocumentId(documentId);
    }

    /**
     * 保存解析任务
     */
    public ParseTask saveParseTask(ParseTask parseTask) {
        if (parseTask.getId() == null) {
            parseTask.setId(java.util.UUID.randomUUID().toString().replace("-", ""));
            parseTask.setCreateTime(new java.util.Date());
            parseTask.setStartedAt(new java.util.Date());
            parseTaskRepository.insert(parseTask);
        } else {
            parseTask.setUpdateTime(new java.util.Date());
            parseTaskRepository.updateById(parseTask);
        }
        return parseTask;
    }

    /**
     * 对指定文档执行解析并将结果写入 TXT 文件
     * 根据文件类型选择不同的处理方式：
     * - PDF: 使用分页判断逻辑（有文本层直接提取，无文本层使用OCR）
     * - 图片: 使用OCR
     * - 其他: 使用OCR
     *
     * @param documentId 文档ID
     * @param sourceFilePath 原始文件路径
     * @param fileType 文件类型
     * @return 更新后的解析任务
     */
    public ParseTask parseAndSaveText(String documentId, String sourceFilePath, FileType fileType) {
        // 1. 初始化或更新解析任务
        ParseTask task = getOrCreateTask(documentId);
        task.setStatus("processing");
        task.setCurrentStep("parsing");
        task.setProgress(10);
        saveParseTask(task);

        try {
            String plainText;
            
            // 2. 根据文件类型选择处理方式
            if (fileType == FileType.PDF) {
                log.info("处理PDF文件: {}", sourceFilePath);
                task.setCurrentStep("pdf_extraction");
                task.setProgress(20);
                saveParseTask(task);
                
                // PDF使用分页判断逻辑，并生成页面索引
                String indexFilePath = buildIndexFilePath(documentId);
                PdfTextExtractionService.ExtractionResult extractionResult = 
                    pdfTextExtractionService.extractTextWithIndex(sourceFilePath, documentId, indexFilePath);
                plainText = extractionResult.getText();
                log.info("PDF提取完成，索引文件: {}", indexFilePath);
            } else if (fileType == FileType.WORD || fileType == FileType.WORD_OLD) {
                log.info("处理Word文件: {}", sourceFilePath);
                task.setCurrentStep("word_extraction");
                task.setProgress(20);
                saveParseTask(task);
                
                // Word文档直接提取文本
                plainText = wordTextExtractionService.extractText(sourceFilePath);
                
                // 为Word生成行索引
                String indexFilePath = buildIndexFilePath(documentId);
                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
                log.info("Word提取完成，索引文件: {}", indexFilePath);
            } else if (fileType == FileType.EXCEL || fileType == FileType.EXCEL_OLD) {
                log.info("处理Excel文件: {}", sourceFilePath);
                task.setCurrentStep("excel_extraction");
                task.setProgress(20);
                saveParseTask(task);
                
                // Excel表格直接提取文本
                plainText = excelTextExtractionService.extractText(sourceFilePath);
                
                // 为Excel生成行索引
                String indexFilePath = buildIndexFilePath(documentId);
                documentIndexService.generateLineIndex(plainText, documentId, indexFilePath);
                log.info("Excel提取完成，索引文件: {}", indexFilePath);
            } else if (fileType.isImage()) {
                log.info("处理图片文件: {}", sourceFilePath);
                task.setCurrentStep("ocr");
                task.setProgress(20);
                saveParseTask(task);
                
                // 图片使用OCR
                String ocrResult = paddleOcrClient.ocrFile(sourceFilePath);
                plainText = ocrResultParser.parseText(ocrResult);
            } else {
                log.info("处理其他文件类型: {}, 使用OCR", fileType);
                task.setCurrentStep("ocr");
                task.setProgress(20);
                saveParseTask(task);
                
                // 其他文件类型使用OCR
                String ocrResult = paddleOcrClient.ocrFile(sourceFilePath);
                plainText = ocrResultParser.parseText(ocrResult);
            }

            // 3. 将纯文本写入 TXT 文件
            task.setCurrentStep("saving");
            task.setProgress(80);
            saveParseTask(task);
            
            String textFilePath = buildTextFilePath(documentId);
            try {
                writeTextToFile(textFilePath, plainText);
            } catch (IOException ioException) {
                log.error("写入文本到 TXT 文件失败, path={}", textFilePath, ioException);
                throw new RuntimeException("写入文本失败: " + ioException.getMessage(), ioException);
            }
            log.info("文本已写入: {}", textFilePath);

            // 4. 版面分析
            task.setCurrentStep("layout_analysis");
            task.setProgress(85);
            saveParseTask(task);
            
            try {
                LayoutAnalysisService.LayoutAnalysisResult layoutResult = 
                    layoutAnalysisService.analyzeLayout(sourceFilePath, fileType, plainText);
                log.info("版面分析完成: 识别到 {} 个元素", layoutResult.getElementCount());
                
                // 将版面分析结果保存到任务选项（可选，用于后续图节点构建）
                if (task.getOptions() == null) {
                    task.setOptions(new java.util.HashMap<>());
                }
                java.util.Map<String, Object> options = (java.util.Map<String, Object>) task.getOptions();
                options.put("layoutAnalysis", layoutResult);
            } catch (Exception e) {
                log.warn("版面分析失败，但不影响主流程: documentId={}", documentId, e);
                // 版面分析失败不影响主流程，只记录警告日志
            }

            // 5. 记录文本存储路径到数据库
            task.setCurrentStep("recording");
            task.setProgress(90);
            saveParseTask(task);
            
            try {
                recordTextStorage(documentId, textFilePath);
            } catch (Exception e) {
                log.warn("记录文本存储路径失败，但不影响主流程: documentId={}, filePath={}", documentId, textFilePath, e);
                // 记录失败不影响主流程，只记录警告日志
            }

            // 6. 更新任务状态为完成
            task.setStatus("completed");
            task.setCurrentStep("completed");
            task.setProgress(100);
            task.setCompletedAt(new java.util.Date());
            saveParseTask(task);
        } catch (Exception e) {
            // 错误分类和处理
            ErrorCategory errorCategory = ErrorCategory.categorize(e);
            String errorMessage = String.format("[%s] %s", errorCategory.getDescription(), e.getMessage());
            
            log.error("执行解析任务失败, documentId={}, errorCategory={}, retryable={}", 
                    documentId, errorCategory.getDescription(), errorCategory.isRetryable(), e);
            
            task.setStatus("failed");
            task.setCurrentStep("failed");
            task.setErrorMessage(errorMessage);
            
            // 保存错误信息到任务选项
            if (task.getOptions() == null) {
                task.setOptions(new java.util.HashMap<>());
            }
            java.util.Map<String, Object> options = (java.util.Map<String, Object>) task.getOptions();
            options.put("errorCategory", errorCategory.name());
            options.put("retryable", errorCategory.isRetryable());
            
            saveParseTask(task);
            throw e;
        }

        return task;
    }
    
    /**
     * 对指定文档执行 OCR 并将结果写入 TXT 文件（兼容旧接口）
     *
     * @param documentId 文档ID
     * @param sourceFilePath 原始文件路径
     * @return 更新后的解析任务
     */
    @Deprecated
    public ParseTask runOcrAndSaveText(String documentId, String sourceFilePath) {
        // 自动检测文件类型
        FileType fileType = detectFileType(sourceFilePath);
        return parseAndSaveText(documentId, sourceFilePath, fileType);
    }
    
    /**
     * 检测文件类型
     */
    private FileType detectFileType(String filePath) {
        File file = new File(filePath);
        String fileName = file.getName();
        String extension = "";
        if (fileName.contains(".")) {
            extension = fileName.substring(fileName.lastIndexOf(".") + 1).toLowerCase();
        }
        return FileType.fromExtension(extension);
    }

    /**
     * 获取或创建解析任务
     */
    private ParseTask getOrCreateTask(String documentId) {
        ParseTask existing = parseTaskRepository.findByDocumentId(documentId);
        if (existing != null) {
            return existing;
        }
        ParseTask task = new ParseTask();
        task.setDocumentId(documentId);
        task.setStatus("pending");
        task.setProgress(0);
        return task;
    }

    /**
     * 根据文档ID构建 TXT 文件存储路径
     */
    private String buildTextFilePath(String documentId) {
        Path path = Path.of(
                fileStorageProperties.getTextPath(),
                documentId.substring(0, 2),
                documentId + ".txt"
        );
        return path.toString();
    }
    
    /**
     * 根据文档ID构建索引文件存储路径
     */
    private String buildIndexFilePath(String documentId) {
        Path path = Path.of(
                fileStorageProperties.getTextPath(),
                documentId.substring(0, 2),
                documentId + "_index.json"
        );
        return path.toString();
    }

    /**
     * 将纯文本写入 TXT 文件
     * 对于大文件使用分块写入，避免内存溢出
     */
    private void writeTextToFile(String textFilePath, String content) throws IOException {
        Path path = Path.of(textFilePath);
        Files.createDirectories(path.getParent());
        
        // 如果内容较大，使用分块写入
        long contentSize = content.getBytes(StandardCharsets.UTF_8).length;
        if (contentSize > 50 * 1024 * 1024) { // 50MB
            log.info("文本内容较大 ({} MB)，使用分块写入: {}", contentSize / (1024.0 * 1024.0), textFilePath);
            com.lingyue.parse.util.FileChunkProcessor.writeTextFileInChunks(
                    textFilePath, content, 10 * 1024 * 1024); // 10MB块
        } else {
            Files.writeString(path, content, StandardCharsets.UTF_8);
        }
    }

    /**
     * 从 OCR 返回结果中提取纯文本（兼容旧接口）
     * 
     * @deprecated 使用 OcrResultParser.parseText() 替代
     */
    @Deprecated
    private String extractPlainTextFromOcrResult(String ocrResult) {
        return ocrResultParser.parseText(ocrResult);
    }

    /**
     * 记录文本存储路径到数据库并自动建立 RAG 索引
     * 单体应用模式：直接调用 Service 层
     * 
     * @param documentId 文档ID
     * @param textFilePath 文本文件路径
     */
    private void recordTextStorage(String documentId, String textFilePath) {
        try {
            // 使用 saveAndIndex 方法，保存文本的同时自动建立 RAG 索引
            textStorageService.saveAndIndex(documentId, textFilePath);
            log.info("文本存储路径记录并建立索引成功: documentId={}, filePath={}", documentId, textFilePath);
        } catch (Exception e) {
            log.error("记录文本存储路径异常: documentId={}, filePath={}", documentId, textFilePath, e);
            // 记录失败不影响主流程，只记录日志
        }
    }

}