StructuredDocumentService.java 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517
  1. package com.lingyue.document.service;
  2. import com.lingyue.common.exception.ServiceException;
  3. import com.lingyue.document.dto.StructuredDocumentDTO;
  4. import com.lingyue.document.dto.StructuredDocumentDTO.*;
  5. import com.lingyue.document.entity.Document;
  6. import com.lingyue.document.entity.DocumentBlock;
  7. import com.lingyue.document.entity.DocumentBlock.TextElement;
  8. import com.lingyue.document.entity.DocumentElement;
  9. import com.lingyue.document.repository.DocumentBlockRepository;
  10. import com.lingyue.document.repository.DocumentRepository;
  11. import lombok.RequiredArgsConstructor;
  12. import lombok.extern.slf4j.Slf4j;
  13. import org.springframework.stereotype.Service;
  14. import org.springframework.transaction.annotation.Transactional;
  15. import java.util.*;
  16. import java.util.stream.Collectors;
  17. /**
  18. * 结构化文档服务(参考飞书设计)
  19. *
  20. * 核心设计:
  21. * - 文档由 Block 树组成,每个 Block 包含 elements 数组
  22. * - 实体作为 TextElement(type=entity)嵌入块中
  23. * - 编辑时修改 elements 数组,无需处理字符偏移
  24. *
  25. * @author lingyue
  26. * @since 2026-01-21
  27. */
  28. @Slf4j
  29. @Service
  30. @RequiredArgsConstructor
  31. public class StructuredDocumentService {
  32. private final DocumentRepository documentRepository;
  33. private final DocumentBlockRepository blockRepository;
  34. private final DocumentElementService documentElementService;
  35. /**
  36. * 获取结构化文档(用于编辑器渲染)
  37. */
  38. public StructuredDocumentDTO getStructuredDocument(String documentId) {
  39. // 1. 获取文档基本信息
  40. Document document = documentRepository.selectById(documentId);
  41. if (document == null) {
  42. return null;
  43. }
  44. // 2. 获取所有块
  45. List<DocumentBlock> blocks = blockRepository.findByDocumentId(documentId);
  46. // 3. 构建块 DTO 列表
  47. List<BlockDTO> blockDTOs = blocks.stream()
  48. .map(this::buildBlockDTO)
  49. .collect(Collectors.toList());
  50. // 4. 统计实体
  51. EntityStats stats = buildEntityStats(blocks);
  52. // 5. 获取图片列表
  53. List<ImageDTO> images = buildImageList(documentId);
  54. return StructuredDocumentDTO.builder()
  55. .documentId(documentId)
  56. .revision(1) // TODO: 实现版本控制
  57. .title(document.getName())
  58. .status(document.getStatus())
  59. .blocks(blockDTOs)
  60. .images(images)
  61. .entityStats(stats)
  62. .updatedAt(document.getUpdateTime())
  63. .build();
  64. }
  65. /**
  66. * 构建图片列表(从 document_elements 表获取)
  67. */
  68. private List<ImageDTO> buildImageList(String documentId) {
  69. List<DocumentElement> imageElements = documentElementService.getImagesByDocumentId(documentId);
  70. return imageElements.stream()
  71. .map(el -> ImageDTO.builder()
  72. .index(el.getElementIndex())
  73. .url(el.getImageUrl())
  74. .alt(el.getImageAlt())
  75. .width(el.getImageWidth())
  76. .height(el.getImageHeight())
  77. .format(el.getImageFormat())
  78. .build())
  79. .collect(Collectors.toList());
  80. }
  81. /**
  82. * 构建块 DTO
  83. */
  84. private BlockDTO buildBlockDTO(DocumentBlock block) {
  85. return BlockDTO.builder()
  86. .id(block.getId())
  87. .parentId(block.getParentId())
  88. .children(block.getChildren())
  89. .index(block.getBlockIndex())
  90. .type(block.getBlockType())
  91. .elements(block.getElements())
  92. .plainText(block.getPlainText())
  93. .html(block.toHtml())
  94. .markedHtml(block.toMarkedHtml())
  95. .build();
  96. }
  97. /**
  98. * 构建实体统计(从块的 elements 中提取)
  99. */
  100. private EntityStats buildEntityStats(List<DocumentBlock> blocks) {
  101. int total = 0;
  102. int confirmed = 0;
  103. Map<String, Integer> byType = new HashMap<>();
  104. for (DocumentBlock block : blocks) {
  105. if (block.getElements() == null) continue;
  106. for (TextElement el : block.getElements()) {
  107. if ("entity".equals(el.getType())) {
  108. total++;
  109. if (Boolean.TRUE.equals(el.getConfirmed())) {
  110. confirmed++;
  111. }
  112. String entityType = el.getEntityType();
  113. if (entityType != null) {
  114. byType.merge(entityType, 1, Integer::sum);
  115. }
  116. }
  117. }
  118. }
  119. return EntityStats.builder()
  120. .total(total)
  121. .confirmed(confirmed)
  122. .byType(byType)
  123. .build();
  124. }
  125. // ==================== 块操作 ====================
  126. /**
  127. * 更新块的 elements
  128. */
  129. @Transactional
  130. public void updateBlockElements(String blockId, List<TextElement> elements) {
  131. DocumentBlock block = blockRepository.selectById(blockId);
  132. if (block == null) {
  133. throw new ServiceException("块不存在: " + blockId);
  134. }
  135. block.setElements(elements);
  136. block.setUpdateTime(new Date());
  137. blockRepository.updateById(block);
  138. log.info("更新块元素: blockId={}, elementCount={}", blockId, elements.size());
  139. }
  140. /**
  141. * 在块内添加实体(将文本片段转为实体元素)
  142. *
  143. * @param blockId 块ID
  144. * @param elementIndex 要转换的元素索引
  145. * @param startOffset 在该元素文本中的起始位置
  146. * @param endOffset 在该元素文本中的结束位置
  147. * @param entityType 实体类型
  148. * @return 新创建的实体ID
  149. */
  150. @Transactional
  151. public String markEntity(String blockId, int elementIndex, int startOffset, int endOffset, String entityType) {
  152. DocumentBlock block = blockRepository.selectById(blockId);
  153. if (block == null || block.getElements() == null) {
  154. throw new ServiceException("块不存在或没有内容");
  155. }
  156. List<TextElement> elements = new ArrayList<>(block.getElements());
  157. if (elementIndex >= elements.size()) {
  158. throw new ServiceException("元素索引越界");
  159. }
  160. TextElement targetElement = elements.get(elementIndex);
  161. if (!"text_run".equals(targetElement.getType()) || targetElement.getContent() == null) {
  162. throw new ServiceException("只能在文本元素上标记实体");
  163. }
  164. String content = targetElement.getContent();
  165. if (startOffset < 0 || endOffset > content.length() || startOffset >= endOffset) {
  166. throw new ServiceException("偏移量无效");
  167. }
  168. String entityId = UUID.randomUUID().toString().replace("-", "");
  169. String entityText = content.substring(startOffset, endOffset);
  170. // 拆分元素:前段文本 + 实体 + 后段文本
  171. List<TextElement> newElements = new ArrayList<>();
  172. // 前段文本
  173. if (startOffset > 0) {
  174. TextElement before = new TextElement();
  175. before.setType("text_run");
  176. before.setContent(content.substring(0, startOffset));
  177. before.setStyle(targetElement.getStyle());
  178. newElements.add(before);
  179. }
  180. // 实体元素
  181. TextElement entity = new TextElement();
  182. entity.setType("entity");
  183. entity.setEntityId(entityId);
  184. entity.setEntityText(entityText);
  185. entity.setEntityType(entityType);
  186. entity.setConfirmed(true); // 手动标记的直接确认
  187. newElements.add(entity);
  188. // 后段文本
  189. if (endOffset < content.length()) {
  190. TextElement after = new TextElement();
  191. after.setType("text_run");
  192. after.setContent(content.substring(endOffset));
  193. after.setStyle(targetElement.getStyle());
  194. newElements.add(after);
  195. }
  196. // 替换原元素
  197. elements.remove(elementIndex);
  198. elements.addAll(elementIndex, newElements);
  199. block.setElements(elements);
  200. block.setUpdateTime(new Date());
  201. blockRepository.updateById(block);
  202. log.info("标记实体: blockId={}, entityId={}, text={}, type={}", blockId, entityId, entityText, entityType);
  203. return entityId;
  204. }
  205. /**
  206. * 删除实体标记(将实体元素还原为文本)
  207. */
  208. @Transactional
  209. public void unmarkEntity(String blockId, String entityId) {
  210. DocumentBlock block = blockRepository.selectById(blockId);
  211. if (block == null || block.getElements() == null) {
  212. return;
  213. }
  214. List<TextElement> elements = new ArrayList<>(block.getElements());
  215. for (int i = 0; i < elements.size(); i++) {
  216. TextElement el = elements.get(i);
  217. if ("entity".equals(el.getType()) && entityId.equals(el.getEntityId())) {
  218. // 将实体还原为文本
  219. TextElement textEl = new TextElement();
  220. textEl.setType("text_run");
  221. textEl.setContent(el.getEntityText());
  222. elements.set(i, textEl);
  223. break;
  224. }
  225. }
  226. // 合并相邻的文本元素
  227. elements = mergeAdjacentTextRuns(elements);
  228. block.setElements(elements);
  229. block.setUpdateTime(new Date());
  230. blockRepository.updateById(block);
  231. log.info("取消实体标记: blockId={}, entityId={}", blockId, entityId);
  232. }
  233. /**
  234. * 更新实体类型
  235. */
  236. @Transactional
  237. public void updateEntityType(String blockId, String entityId, String newType) {
  238. DocumentBlock block = blockRepository.selectById(blockId);
  239. if (block == null || block.getElements() == null) {
  240. return;
  241. }
  242. for (TextElement el : block.getElements()) {
  243. if ("entity".equals(el.getType()) && entityId.equals(el.getEntityId())) {
  244. el.setEntityType(newType);
  245. break;
  246. }
  247. }
  248. block.setUpdateTime(new Date());
  249. blockRepository.updateById(block);
  250. }
  251. /**
  252. * 确认实体
  253. */
  254. @Transactional
  255. public void confirmEntity(String blockId, String entityId) {
  256. DocumentBlock block = blockRepository.selectById(blockId);
  257. if (block == null || block.getElements() == null) {
  258. return;
  259. }
  260. for (TextElement el : block.getElements()) {
  261. if ("entity".equals(el.getType()) && entityId.equals(el.getEntityId())) {
  262. el.setConfirmed(true);
  263. break;
  264. }
  265. }
  266. block.setUpdateTime(new Date());
  267. blockRepository.updateById(block);
  268. }
  269. /**
  270. * 合并相邻的文本元素
  271. */
  272. private List<TextElement> mergeAdjacentTextRuns(List<TextElement> elements) {
  273. if (elements.size() <= 1) {
  274. return elements;
  275. }
  276. List<TextElement> merged = new ArrayList<>();
  277. TextElement current = null;
  278. for (TextElement el : elements) {
  279. if ("text_run".equals(el.getType())) {
  280. if (current == null) {
  281. current = new TextElement();
  282. current.setType("text_run");
  283. current.setContent(el.getContent());
  284. current.setStyle(el.getStyle());
  285. } else {
  286. // 合并文本
  287. current.setContent(current.getContent() + el.getContent());
  288. }
  289. } else {
  290. if (current != null) {
  291. merged.add(current);
  292. current = null;
  293. }
  294. merged.add(el);
  295. }
  296. }
  297. if (current != null) {
  298. merged.add(current);
  299. }
  300. return merged;
  301. }
  302. // ==================== 块增删操作 ====================
  303. /**
  304. * 创建新块
  305. */
  306. @Transactional
  307. public DocumentBlock createBlock(String documentId, String parentId, int index,
  308. String blockType, List<TextElement> elements) {
  309. DocumentBlock block = new DocumentBlock();
  310. block.setId(UUID.randomUUID().toString().replace("-", ""));
  311. block.setDocumentId(documentId);
  312. block.setParentId(parentId);
  313. block.setBlockIndex(index);
  314. block.setBlockType(blockType);
  315. block.setElements(elements);
  316. block.setCreateTime(new Date());
  317. block.setUpdateTime(new Date());
  318. blockRepository.insert(block);
  319. // 更新父块的 children
  320. if (parentId != null) {
  321. DocumentBlock parent = blockRepository.selectById(parentId);
  322. if (parent != null) {
  323. List<String> children = parent.getChildren();
  324. if (children == null) {
  325. children = new ArrayList<>();
  326. }
  327. children.add(block.getId());
  328. parent.setChildren(children);
  329. blockRepository.updateById(parent);
  330. }
  331. }
  332. log.info("创建块: documentId={}, blockId={}, type={}", documentId, block.getId(), blockType);
  333. return block;
  334. }
  335. /**
  336. * 删除块
  337. */
  338. @Transactional
  339. public void deleteBlock(String blockId) {
  340. DocumentBlock block = blockRepository.selectById(blockId);
  341. if (block == null) {
  342. return;
  343. }
  344. // 递归删除子块
  345. if (block.getChildren() != null) {
  346. for (String childId : block.getChildren()) {
  347. deleteBlock(childId);
  348. }
  349. }
  350. // 从父块的 children 中移除
  351. if (block.getParentId() != null) {
  352. DocumentBlock parent = blockRepository.selectById(block.getParentId());
  353. if (parent != null && parent.getChildren() != null) {
  354. parent.getChildren().remove(block.getId());
  355. blockRepository.updateById(parent);
  356. }
  357. }
  358. blockRepository.deleteById(blockId);
  359. log.info("删除块: blockId={}", blockId);
  360. }
  361. // ==================== 批量操作 ====================
  362. /**
  363. * 批量保存文档块(用于 NER 完成后生成结构化文档)
  364. *
  365. * @param documentId 文档ID
  366. * @param blocks 块列表(来自 DocumentBlockGeneratorService)
  367. * @return 保存的块数量
  368. */
  369. @Transactional
  370. public int saveBlocksBatch(String documentId, List<Map<String, Object>> blocks) {
  371. if (blocks == null || blocks.isEmpty()) {
  372. log.warn("批量保存块: 块列表为空, documentId={}", documentId);
  373. return 0;
  374. }
  375. // 先删除该文档的旧块
  376. int deleted = blockRepository.deleteByDocumentId(documentId);
  377. if (deleted > 0) {
  378. log.info("删除旧块: documentId={}, count={}", documentId, deleted);
  379. }
  380. // 保存新块
  381. int savedCount = 0;
  382. for (Map<String, Object> blockMap : blocks) {
  383. try {
  384. DocumentBlock block = convertMapToBlock(blockMap);
  385. block.setDocumentId(documentId);
  386. block.setCreateTime(new Date());
  387. block.setUpdateTime(new Date());
  388. blockRepository.insert(block);
  389. savedCount++;
  390. } catch (Exception e) {
  391. log.error("保存块失败: documentId={}, block={}, error={}",
  392. documentId, blockMap, e.getMessage());
  393. }
  394. }
  395. log.info("批量保存块完成: documentId={}, savedCount={}", documentId, savedCount);
  396. return savedCount;
  397. }
  398. /**
  399. * 将 Map 转换为 DocumentBlock
  400. */
  401. @SuppressWarnings("unchecked")
  402. private DocumentBlock convertMapToBlock(Map<String, Object> blockMap) {
  403. DocumentBlock block = new DocumentBlock();
  404. block.setId((String) blockMap.get("blockId"));
  405. block.setDocumentId((String) blockMap.get("documentId"));
  406. block.setParentId((String) blockMap.get("parentId"));
  407. block.setBlockIndex(getIntValue(blockMap, "blockIndex", 0));
  408. block.setBlockType((String) blockMap.get("blockType"));
  409. // 转换 children
  410. Object childrenObj = blockMap.get("children");
  411. if (childrenObj instanceof List) {
  412. block.setChildren((List<String>) childrenObj);
  413. }
  414. // 转换 elements
  415. Object elementsObj = blockMap.get("elements");
  416. if (elementsObj instanceof List) {
  417. List<Map<String, Object>> elementMaps = (List<Map<String, Object>>) elementsObj;
  418. List<TextElement> elements = new ArrayList<>();
  419. for (Map<String, Object> elMap : elementMaps) {
  420. TextElement el = new TextElement();
  421. el.setType((String) elMap.get("type"));
  422. el.setContent((String) elMap.get("content"));
  423. el.setEntityId((String) elMap.get("entityId"));
  424. el.setEntityText((String) elMap.get("entityText"));
  425. el.setEntityType((String) elMap.get("entityType"));
  426. el.setConfirmed((Boolean) elMap.get("confirmed"));
  427. el.setUrl((String) elMap.get("url"));
  428. el.setRefDocId((String) elMap.get("refDocId"));
  429. el.setRefDocTitle((String) elMap.get("refDocTitle"));
  430. elements.add(el);
  431. }
  432. block.setElements(elements);
  433. }
  434. return block;
  435. }
  436. private int getIntValue(Map<String, Object> map, String key, int defaultValue) {
  437. Object value = map.get(key);
  438. if (value instanceof Number) {
  439. return ((Number) value).intValue();
  440. }
  441. return defaultValue;
  442. }
  443. }