package com.lingyue.document.service; import com.lingyue.common.exception.ServiceException; import com.lingyue.document.dto.StructuredDocumentDTO; import com.lingyue.document.dto.StructuredDocumentDTO.*; import com.lingyue.document.entity.Document; import com.lingyue.document.entity.DocumentBlock; import com.lingyue.document.entity.DocumentBlock.TextElement; import com.lingyue.document.entity.DocumentElement; import com.lingyue.document.repository.DocumentBlockRepository; import com.lingyue.document.repository.DocumentRepository; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; import java.util.*; import java.util.stream.Collectors; /** * 结构化文档服务(参考飞书设计) * * 核心设计: * - 文档由 Block 树组成,每个 Block 包含 elements 数组 * - 实体作为 TextElement(type=entity)嵌入块中 * - 编辑时修改 elements 数组,无需处理字符偏移 * * @author lingyue * @since 2026-01-21 */ @Slf4j @Service @RequiredArgsConstructor public class StructuredDocumentService { private final DocumentRepository documentRepository; private final DocumentBlockRepository blockRepository; private final DocumentElementService documentElementService; /** * 获取结构化文档(用于编辑器渲染) */ public StructuredDocumentDTO getStructuredDocument(String documentId) { // 1. 获取文档基本信息 Document document = documentRepository.selectById(documentId); if (document == null) { return null; } // 2. 获取所有块 List blocks = blockRepository.findByDocumentId(documentId); // 3. 构建块 DTO 列表 List blockDTOs = blocks.stream() .map(this::buildBlockDTO) .collect(Collectors.toList()); // 4. 统计实体 EntityStats stats = buildEntityStats(blocks); // 5. 获取图片列表 List images = buildImageList(documentId); // 6. 获取段落列表(包含格式信息) List paragraphs = buildParagraphList(documentId); return StructuredDocumentDTO.builder() .documentId(documentId) .revision(1) // TODO: 实现版本控制 .title(document.getName()) .status(document.getStatus()) .blocks(blockDTOs) .images(images) .paragraphs(paragraphs) .entityStats(stats) .updatedAt(document.getUpdateTime()) .build(); } /** * 构建图片列表(从 document_elements 表获取) */ private List buildImageList(String documentId) { List imageElements = documentElementService.getImagesByDocumentId(documentId); return imageElements.stream() .map(el -> ImageDTO.builder() .index(el.getElementIndex()) .url(el.getImageUrl()) .alt(el.getImageAlt()) .width(el.getImageWidth()) .height(el.getImageHeight()) .format(el.getImageFormat()) .build()) .collect(Collectors.toList()); } /** * 构建段落列表(从 document_elements 表获取,包含格式信息) */ private List buildParagraphList(String documentId) { List elements = documentElementService.getElementsByDocumentId(documentId); return elements.stream() .filter(el -> el.getElementType() != null && !el.getElementType().equals("image") && !el.getElementType().equals("table")) .map(this::convertToParagraphDTO) .collect(Collectors.toList()); } /** * 将 DocumentElement 转换为 ParagraphDTO */ @SuppressWarnings("unchecked") private StructuredDocumentDTO.ParagraphDTO convertToParagraphDTO(DocumentElement el) { List runDTOs = null; if (el.getRuns() != null && !el.getRuns().isEmpty()) { runDTOs = el.getRuns().stream() .map(run -> StructuredDocumentDTO.TextRunDTO.builder() .text((String) run.get("text")) .fontFamily((String) run.get("fontFamily")) .fontSize(run.get("fontSize") instanceof Number ? ((Number) run.get("fontSize")).doubleValue() : null) .bold((Boolean) run.get("bold")) .italic((Boolean) run.get("italic")) .underline((String) run.get("underline")) .color((String) run.get("color")) .strikeThrough((Boolean) run.get("strikeThrough")) .verticalAlign((String) run.get("verticalAlign")) .highlightColor((String) run.get("highlightColor")) .build()) .collect(Collectors.toList()); } return StructuredDocumentDTO.ParagraphDTO.builder() .index(el.getElementIndex()) .type(el.getElementType()) .content(el.getContent()) .style(el.getStyle()) .runs(runDTOs) .build(); } /** * 构建块 DTO */ private BlockDTO buildBlockDTO(DocumentBlock block) { return BlockDTO.builder() .id(block.getId()) .parentId(block.getParentId()) .children(block.getChildren()) .index(block.getBlockIndex()) .type(block.getBlockType()) .elements(block.getElements()) .plainText(block.getPlainText()) .html(block.toHtml()) .markedHtml(block.toMarkedHtml()) .build(); } /** * 构建实体统计(从块的 elements 中提取) */ private EntityStats buildEntityStats(List blocks) { int total = 0; int confirmed = 0; Map byType = new HashMap<>(); for (DocumentBlock block : blocks) { if (block.getElements() == null) continue; for (TextElement el : block.getElements()) { if ("entity".equals(el.getType())) { total++; if (Boolean.TRUE.equals(el.getConfirmed())) { confirmed++; } String entityType = el.getEntityType(); if (entityType != null) { byType.merge(entityType, 1, Integer::sum); } } } } return EntityStats.builder() .total(total) .confirmed(confirmed) .byType(byType) .build(); } // ==================== 块操作 ==================== /** * 更新块的 elements */ @Transactional public void updateBlockElements(String blockId, List elements) { DocumentBlock block = blockRepository.selectById(blockId); if (block == null) { throw new ServiceException("块不存在: " + blockId); } block.setElements(elements); block.setUpdateTime(new Date()); blockRepository.updateById(block); log.info("更新块元素: blockId={}, elementCount={}", blockId, elements.size()); } /** * 在块内添加实体(将文本片段转为实体元素) * * @param blockId 块ID * @param elementIndex 要转换的元素索引 * @param startOffset 在该元素文本中的起始位置 * @param endOffset 在该元素文本中的结束位置 * @param entityType 实体类型 * @return 新创建的实体ID */ @Transactional public String markEntity(String blockId, int elementIndex, int startOffset, int endOffset, String entityType) { DocumentBlock block = blockRepository.selectById(blockId); if (block == null || block.getElements() == null) { throw new ServiceException("块不存在或没有内容"); } List elements = new ArrayList<>(block.getElements()); if (elementIndex >= elements.size()) { throw new ServiceException("元素索引越界"); } TextElement targetElement = elements.get(elementIndex); if (!"text_run".equals(targetElement.getType()) || targetElement.getContent() == null) { throw new ServiceException("只能在文本元素上标记实体"); } String content = targetElement.getContent(); if (startOffset < 0 || endOffset > content.length() || startOffset >= endOffset) { throw new ServiceException("偏移量无效"); } String entityId = UUID.randomUUID().toString().replace("-", ""); String entityText = content.substring(startOffset, endOffset); // 拆分元素:前段文本 + 实体 + 后段文本 List newElements = new ArrayList<>(); // 前段文本 if (startOffset > 0) { TextElement before = new TextElement(); before.setType("text_run"); before.setContent(content.substring(0, startOffset)); before.setStyle(targetElement.getStyle()); newElements.add(before); } // 实体元素 TextElement entity = new TextElement(); entity.setType("entity"); entity.setEntityId(entityId); entity.setEntityText(entityText); entity.setEntityType(entityType); entity.setConfirmed(true); // 手动标记的直接确认 newElements.add(entity); // 后段文本 if (endOffset < content.length()) { TextElement after = new TextElement(); after.setType("text_run"); after.setContent(content.substring(endOffset)); after.setStyle(targetElement.getStyle()); newElements.add(after); } // 替换原元素 elements.remove(elementIndex); elements.addAll(elementIndex, newElements); block.setElements(elements); block.setUpdateTime(new Date()); blockRepository.updateById(block); log.info("标记实体: blockId={}, entityId={}, text={}, type={}", blockId, entityId, entityText, entityType); return entityId; } /** * 删除实体标记(将实体元素还原为文本) */ @Transactional public void unmarkEntity(String blockId, String entityId) { DocumentBlock block = blockRepository.selectById(blockId); if (block == null || block.getElements() == null) { return; } List elements = new ArrayList<>(block.getElements()); for (int i = 0; i < elements.size(); i++) { TextElement el = elements.get(i); if ("entity".equals(el.getType()) && entityId.equals(el.getEntityId())) { // 将实体还原为文本 TextElement textEl = new TextElement(); textEl.setType("text_run"); textEl.setContent(el.getEntityText()); elements.set(i, textEl); break; } } // 合并相邻的文本元素 elements = mergeAdjacentTextRuns(elements); block.setElements(elements); block.setUpdateTime(new Date()); blockRepository.updateById(block); log.info("取消实体标记: blockId={}, entityId={}", blockId, entityId); } /** * 更新实体类型 */ @Transactional public void updateEntityType(String blockId, String entityId, String newType) { DocumentBlock block = blockRepository.selectById(blockId); if (block == null || block.getElements() == null) { return; } for (TextElement el : block.getElements()) { if ("entity".equals(el.getType()) && entityId.equals(el.getEntityId())) { el.setEntityType(newType); break; } } block.setUpdateTime(new Date()); blockRepository.updateById(block); } /** * 确认实体 */ @Transactional public void confirmEntity(String blockId, String entityId) { DocumentBlock block = blockRepository.selectById(blockId); if (block == null || block.getElements() == null) { return; } for (TextElement el : block.getElements()) { if ("entity".equals(el.getType()) && entityId.equals(el.getEntityId())) { el.setConfirmed(true); break; } } block.setUpdateTime(new Date()); blockRepository.updateById(block); } /** * 合并相邻的文本元素 */ private List mergeAdjacentTextRuns(List elements) { if (elements.size() <= 1) { return elements; } List merged = new ArrayList<>(); TextElement current = null; for (TextElement el : elements) { if ("text_run".equals(el.getType())) { if (current == null) { current = new TextElement(); current.setType("text_run"); current.setContent(el.getContent()); current.setStyle(el.getStyle()); } else { // 合并文本 current.setContent(current.getContent() + el.getContent()); } } else { if (current != null) { merged.add(current); current = null; } merged.add(el); } } if (current != null) { merged.add(current); } return merged; } // ==================== 块增删操作 ==================== /** * 创建新块 */ @Transactional public DocumentBlock createBlock(String documentId, String parentId, int index, String blockType, List elements) { DocumentBlock block = new DocumentBlock(); block.setId(UUID.randomUUID().toString().replace("-", "")); block.setDocumentId(documentId); block.setParentId(parentId); block.setBlockIndex(index); block.setBlockType(blockType); block.setElements(elements); block.setCreateTime(new Date()); block.setUpdateTime(new Date()); blockRepository.insert(block); // 更新父块的 children if (parentId != null) { DocumentBlock parent = blockRepository.selectById(parentId); if (parent != null) { List children = parent.getChildren(); if (children == null) { children = new ArrayList<>(); } children.add(block.getId()); parent.setChildren(children); blockRepository.updateById(parent); } } log.info("创建块: documentId={}, blockId={}, type={}", documentId, block.getId(), blockType); return block; } /** * 删除块 */ @Transactional public void deleteBlock(String blockId) { DocumentBlock block = blockRepository.selectById(blockId); if (block == null) { return; } // 递归删除子块 if (block.getChildren() != null) { for (String childId : block.getChildren()) { deleteBlock(childId); } } // 从父块的 children 中移除 if (block.getParentId() != null) { DocumentBlock parent = blockRepository.selectById(block.getParentId()); if (parent != null && parent.getChildren() != null) { parent.getChildren().remove(block.getId()); blockRepository.updateById(parent); } } blockRepository.deleteById(blockId); log.info("删除块: blockId={}", blockId); } // ==================== 批量操作 ==================== /** * 批量保存文档块(用于 NER 完成后生成结构化文档) * * @param documentId 文档ID * @param blocks 块列表(来自 DocumentBlockGeneratorService) * @return 保存的块数量 */ @Transactional public int saveBlocksBatch(String documentId, List> blocks) { if (blocks == null || blocks.isEmpty()) { log.warn("批量保存块: 块列表为空, documentId={}", documentId); return 0; } // 先删除该文档的旧块 int deleted = blockRepository.deleteByDocumentId(documentId); if (deleted > 0) { log.info("删除旧块: documentId={}, count={}", documentId, deleted); } // 保存新块 int savedCount = 0; for (Map blockMap : blocks) { try { DocumentBlock block = convertMapToBlock(blockMap); block.setDocumentId(documentId); block.setCreateTime(new Date()); block.setUpdateTime(new Date()); blockRepository.insert(block); savedCount++; } catch (Exception e) { log.error("保存块失败: documentId={}, block={}, error={}", documentId, blockMap, e.getMessage()); } } log.info("批量保存块完成: documentId={}, savedCount={}", documentId, savedCount); return savedCount; } /** * 将 Map 转换为 DocumentBlock */ @SuppressWarnings("unchecked") private DocumentBlock convertMapToBlock(Map blockMap) { DocumentBlock block = new DocumentBlock(); block.setId((String) blockMap.get("blockId")); block.setDocumentId((String) blockMap.get("documentId")); block.setParentId((String) blockMap.get("parentId")); block.setBlockIndex(getIntValue(blockMap, "blockIndex", 0)); block.setBlockType((String) blockMap.get("blockType")); // 转换 children Object childrenObj = blockMap.get("children"); if (childrenObj instanceof List) { block.setChildren((List) childrenObj); } // 转换 elements Object elementsObj = blockMap.get("elements"); if (elementsObj instanceof List) { List> elementMaps = (List>) elementsObj; List elements = new ArrayList<>(); for (Map elMap : elementMaps) { TextElement el = new TextElement(); el.setType((String) elMap.get("type")); el.setContent((String) elMap.get("content")); el.setEntityId((String) elMap.get("entityId")); el.setEntityText((String) elMap.get("entityText")); el.setEntityType((String) elMap.get("entityType")); el.setConfirmed((Boolean) elMap.get("confirmed")); el.setUrl((String) elMap.get("url")); el.setRefDocId((String) elMap.get("refDocId")); el.setRefDocTitle((String) elMap.get("refDocTitle")); elements.add(el); } block.setElements(elements); } return block; } private int getIntValue(Map map, String key, int defaultValue) { Object value = map.get(key); if (value instanceof Number) { return ((Number) value).intValue(); } return defaultValue; } }