|
|
@@ -0,0 +1,414 @@
|
|
|
+package com.lingyue.parse.service;
|
|
|
+
|
|
|
import com.lingyue.common.exception.ServiceException;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.util.Units;
import org.apache.poi.xwpf.usermodel.*;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDrawing;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTcPr;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.regex.Pattern;
|
|
|
+
|
|
|
+/**
|
|
|
+ * Word 文档结构化提取服务
|
|
|
+ * 按顺序提取段落、图片、表格,保持原始排版结构
|
|
|
+ *
|
|
|
+ * @author lingyue
|
|
|
+ * @since 2026-01-21
|
|
|
+ */
|
|
|
+@Slf4j
|
|
|
+@Service
|
|
|
+@RequiredArgsConstructor
|
|
|
+public class WordStructuredExtractionService {
|
|
|
+
|
|
|
+ @Value("${file.storage.base-path:/data/lingyue}")
|
|
|
+ private String basePath;
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 提取 Word 文档结构化内容
|
|
|
+ *
|
|
|
+ * @param wordFilePath Word 文件路径
|
|
|
+ * @param documentId 文档 ID(用于存储图片)
|
|
|
+ * @return 结构化内容列表
|
|
|
+ */
|
|
|
+ public WordStructuredResult extractStructured(String wordFilePath, String documentId) {
|
|
|
+ File wordFile = new File(wordFilePath);
|
|
|
+ if (!wordFile.exists()) {
|
|
|
+ throw new ServiceException("Word文件不存在: " + wordFilePath);
|
|
|
+ }
|
|
|
+
|
|
|
+ String fileName = wordFile.getName().toLowerCase();
|
|
|
+ if (!fileName.endsWith(".docx")) {
|
|
|
+ throw new ServiceException("仅支持 .docx 格式的结构化提取: " + fileName);
|
|
|
+ }
|
|
|
+
|
|
|
+ try {
|
|
|
+ return extractFromDocx(wordFilePath, documentId);
|
|
|
+ } catch (IOException e) {
|
|
|
+ log.error("提取Word文档结构化内容失败: {}", wordFilePath, e);
|
|
|
+ throw new ServiceException("提取Word文档结构化内容失败: " + e.getMessage());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 从 .docx 文件提取结构化内容
|
|
|
+ */
|
|
|
+ private WordStructuredResult extractFromDocx(String filePath, String documentId) throws IOException {
|
|
|
+ log.info("开始提取 .docx 结构化内容: {}", filePath);
|
|
|
+
|
|
|
+ WordStructuredResult result = new WordStructuredResult();
|
|
|
+ result.setDocumentId(documentId);
|
|
|
+ List<ContentElement> elements = new ArrayList<>();
|
|
|
+ StringBuilder fullText = new StringBuilder();
|
|
|
+
|
|
|
+ // 创建图片存储目录
|
|
|
+ String imageDir = basePath + "/images/" + documentId;
|
|
|
+ Files.createDirectories(Paths.get(imageDir));
|
|
|
+
|
|
|
+ try (FileInputStream fis = new FileInputStream(filePath);
|
|
|
+ XWPFDocument document = new XWPFDocument(fis)) {
|
|
|
+
|
|
|
+ int elementIndex = 0;
|
|
|
+ int imageIndex = 0;
|
|
|
+ int tableIndex = 0;
|
|
|
+
|
|
|
+ // 遍历文档体中的所有元素(保持顺序)
|
|
|
+ for (IBodyElement bodyElement : document.getBodyElements()) {
|
|
|
+ if (bodyElement instanceof XWPFParagraph) {
|
|
|
+ XWPFParagraph paragraph = (XWPFParagraph) bodyElement;
|
|
|
+
|
|
|
+ // 检查段落中是否有图片
|
|
|
+ List<XWPFRun> runs = paragraph.getRuns();
|
|
|
+ boolean hasImage = false;
|
|
|
+
|
|
|
+ for (XWPFRun run : runs) {
|
|
|
+ List<XWPFPicture> pictures = run.getEmbeddedPictures();
|
|
|
+ for (XWPFPicture picture : pictures) {
|
|
|
+ hasImage = true;
|
|
|
+ // 提取图片
|
|
|
+ ContentElement imgElement = extractImage(picture, imageDir, documentId, imageIndex++, elementIndex++);
|
|
|
+ if (imgElement != null) {
|
|
|
+ elements.add(imgElement);
|
|
|
+ fullText.append("[图片: ").append(imgElement.getImageAlt()).append("]\n");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取段落文本
|
|
|
+ String text = paragraph.getText();
|
|
|
+ if (text != null && !text.trim().isEmpty()) {
|
|
|
+ ContentElement textElement = new ContentElement();
|
|
|
+ textElement.setIndex(elementIndex++);
|
|
|
+ textElement.setType(detectParagraphType(paragraph, text));
|
|
|
+ textElement.setContent(text.trim());
|
|
|
+ textElement.setStyle(extractParagraphStyle(paragraph));
|
|
|
+
|
|
|
+ elements.add(textElement);
|
|
|
+ fullText.append(text).append("\n");
|
|
|
+ }
|
|
|
+
|
|
|
+ } else if (bodyElement instanceof XWPFTable) {
|
|
|
+ XWPFTable table = (XWPFTable) bodyElement;
|
|
|
+
|
|
|
+ // 提取表格
|
|
|
+ ContentElement tableElement = extractTable(table, tableIndex++, elementIndex++);
|
|
|
+ elements.add(tableElement);
|
|
|
+ fullText.append("[表格 ").append(tableIndex).append("]\n");
|
|
|
+ fullText.append(tableElement.getTableText()).append("\n");
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ result.setElements(elements);
|
|
|
+ result.setFullText(fullText.toString());
|
|
|
+ result.setImageCount(imageIndex);
|
|
|
+ result.setTableCount(tableIndex);
|
|
|
+ result.setTotalElements(elements.size());
|
|
|
+
|
|
|
+ log.info("结构化提取完成: elements={}, images={}, tables={}",
|
|
|
+ elements.size(), imageIndex, tableIndex);
|
|
|
+ }
|
|
|
+
|
|
|
+ return result;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 提取图片
|
|
|
+ */
|
|
|
+ private ContentElement extractImage(XWPFPicture picture, String imageDir,
|
|
|
+ String documentId, int imageIndex, int elementIndex) {
|
|
|
+ try {
|
|
|
+ XWPFPictureData pictureData = picture.getPictureData();
|
|
|
+ if (pictureData == null) {
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+
|
|
|
+ String extension = pictureData.suggestFileExtension();
|
|
|
+ String imageName = String.format("image_%03d.%s", imageIndex, extension);
|
|
|
+ String imagePath = imageDir + "/" + imageName;
|
|
|
+
|
|
|
+ // 保存图片
|
|
|
+ try (FileOutputStream fos = new FileOutputStream(imagePath)) {
|
|
|
+ fos.write(pictureData.getData());
|
|
|
+ }
|
|
|
+
|
|
|
+ // 获取图片尺寸
|
|
|
+ int width = 0;
|
|
|
+ int height = 0;
|
|
|
+ try {
|
|
|
+ // 尝试从 CTDrawing 获取尺寸(EMU 单位)
|
|
|
+ width = (int) (picture.getCTPicture().getSpPr().getXfrm().getExt().getCx() / Units.EMU_PER_PIXEL);
|
|
|
+ height = (int) (picture.getCTPicture().getSpPr().getXfrm().getExt().getCy() / Units.EMU_PER_PIXEL);
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.debug("无法获取图片尺寸: {}", e.getMessage());
|
|
|
+ }
|
|
|
+
|
|
|
+ ContentElement element = new ContentElement();
|
|
|
+ element.setIndex(elementIndex);
|
|
|
+ element.setType("image");
|
|
|
+ element.setContent(null);
|
|
|
+ element.setImageUrl("/api/v1/files/images/" + documentId + "/" + imageName);
|
|
|
+ element.setImagePath(imagePath);
|
|
|
+ element.setImageAlt(picture.getDescription() != null ? picture.getDescription() : "图片 " + (imageIndex + 1));
|
|
|
+ element.setImageWidth(width > 0 ? width : null);
|
|
|
+ element.setImageHeight(height > 0 ? height : null);
|
|
|
+ element.setImageFormat(extension);
|
|
|
+
|
|
|
+ log.debug("提取图片: index={}, path={}, size={}x{}", imageIndex, imagePath, width, height);
|
|
|
+ return element;
|
|
|
+
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.warn("提取图片失败: {}", e.getMessage());
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 提取表格
|
|
|
+ */
|
|
|
+ private ContentElement extractTable(XWPFTable table, int tableIndex, int elementIndex) {
|
|
|
+ ContentElement element = new ContentElement();
|
|
|
+ element.setIndex(elementIndex);
|
|
|
+ element.setType("table");
|
|
|
+ element.setTableIndex(tableIndex);
|
|
|
+
|
|
|
+ List<List<TableCell>> rows = new ArrayList<>();
|
|
|
+ StringBuilder tableText = new StringBuilder();
|
|
|
+
|
|
|
+ int rowCount = table.getNumberOfRows();
|
|
|
+ int colCount = 0;
|
|
|
+
|
|
|
+ for (int i = 0; i < rowCount; i++) {
|
|
|
+ XWPFTableRow row = table.getRow(i);
|
|
|
+ List<XWPFTableCell> cells = row.getTableCells();
|
|
|
+ List<TableCell> rowData = new ArrayList<>();
|
|
|
+
|
|
|
+ colCount = Math.max(colCount, cells.size());
|
|
|
+
|
|
|
+ for (int j = 0; j < cells.size(); j++) {
|
|
|
+ XWPFTableCell cell = cells.get(j);
|
|
|
+ TableCell cellData = new TableCell();
|
|
|
+ cellData.setRow(i);
|
|
|
+ cellData.setCol(j);
|
|
|
+ cellData.setText(cell.getText());
|
|
|
+
|
|
|
+ // 提取单元格样式
|
|
|
+ try {
|
|
|
+ if (cell.getCTTc().getTcPr() != null) {
|
|
|
+ if (cell.getCTTc().getTcPr().getGridSpan() != null) {
|
|
|
+ cellData.setColSpan(cell.getCTTc().getTcPr().getGridSpan().getVal().intValue());
|
|
|
+ }
|
|
|
+ if (cell.getCTTc().getTcPr().getVMerge() != null) {
|
|
|
+ cellData.setMerged(true);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } catch (Exception e) {
|
|
|
+ // 忽略样式提取错误
|
|
|
+ }
|
|
|
+
|
|
|
+ rowData.add(cellData);
|
|
|
+ tableText.append(cell.getText());
|
|
|
+ if (j < cells.size() - 1) {
|
|
|
+ tableText.append("\t");
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ rows.add(rowData);
|
|
|
+ tableText.append("\n");
|
|
|
+ }
|
|
|
+
|
|
|
+ element.setTableRows(rows);
|
|
|
+ element.setTableRowCount(rowCount);
|
|
|
+ element.setTableColCount(colCount);
|
|
|
+ element.setTableText(tableText.toString().trim());
|
|
|
+
|
|
|
+ log.debug("提取表格: index={}, rows={}, cols={}", tableIndex, rowCount, colCount);
|
|
|
+ return element;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 检测段落类型
|
|
|
+ */
|
|
|
+ private String detectParagraphType(XWPFParagraph paragraph, String text) {
|
|
|
+ // 检查样式名称
|
|
|
+ String styleName = paragraph.getStyle();
|
|
|
+ if (styleName != null) {
|
|
|
+ String lowerStyle = styleName.toLowerCase();
|
|
|
+ if (lowerStyle.contains("heading") || lowerStyle.contains("标题")) {
|
|
|
+ return "heading";
|
|
|
+ }
|
|
|
+ if (lowerStyle.contains("title")) {
|
|
|
+ return "title";
|
|
|
+ }
|
|
|
+ if (lowerStyle.contains("toc")) {
|
|
|
+ return "toc";
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 检查大纲级别
|
|
|
+ if (paragraph.getCTP().getPPr() != null &&
|
|
|
+ paragraph.getCTP().getPPr().getOutlineLvl() != null) {
|
|
|
+ int level = paragraph.getCTP().getPPr().getOutlineLvl().getVal().intValue();
|
|
|
+ if (level >= 0 && level <= 9) {
|
|
|
+ return "heading" + (level + 1);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 基于内容检测
|
|
|
+ String trimmed = text.trim();
|
|
|
+ if (trimmed.length() < 100) {
|
|
|
+ if (trimmed.matches("^[一二三四五六七八九十]+[、.].*") ||
|
|
|
+ trimmed.matches("^第[一二三四五六七八九十]+[章节部分].*") ||
|
|
|
+ trimmed.matches("^\\d+\\.\\d*\\s+.*")) {
|
|
|
+ return "heading";
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 检测列表
|
|
|
+ if (trimmed.matches("^[•·\\-\\*]\\s+.*") ||
|
|
|
+ trimmed.matches("^\\d+[.、)]\\s+.*") ||
|
|
|
+ trimmed.matches("^[a-zA-Z][.)]\\s+.*")) {
|
|
|
+ return "list_item";
|
|
|
+ }
|
|
|
+
|
|
|
+ return "paragraph";
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 提取段落样式
|
|
|
+ */
|
|
|
+ private Map<String, Object> extractParagraphStyle(XWPFParagraph paragraph) {
|
|
|
+ Map<String, Object> style = new HashMap<>();
|
|
|
+
|
|
|
+ try {
|
|
|
+ // 对齐方式
|
|
|
+ ParagraphAlignment alignment = paragraph.getAlignment();
|
|
|
+ if (alignment != null) {
|
|
|
+ style.put("alignment", alignment.name().toLowerCase());
|
|
|
+ }
|
|
|
+
|
|
|
+ // 缩进
|
|
|
+ int indentLeft = paragraph.getIndentationLeft();
|
|
|
+ if (indentLeft > 0) {
|
|
|
+ style.put("indentLeft", indentLeft);
|
|
|
+ }
|
|
|
+ int indentFirstLine = paragraph.getIndentationFirstLine();
|
|
|
+ if (indentFirstLine > 0) {
|
|
|
+ style.put("indentFirstLine", indentFirstLine);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 行距
|
|
|
+ if (paragraph.getSpacingBetween() > 0) {
|
|
|
+ style.put("lineSpacing", paragraph.getSpacingBetween());
|
|
|
+ }
|
|
|
+
|
|
|
+ // 字体(从第一个 run 获取)
|
|
|
+ List<XWPFRun> runs = paragraph.getRuns();
|
|
|
+ if (!runs.isEmpty()) {
|
|
|
+ XWPFRun firstRun = runs.get(0);
|
|
|
+ if (firstRun.getFontFamily() != null) {
|
|
|
+ style.put("fontFamily", firstRun.getFontFamily());
|
|
|
+ }
|
|
|
+ if (firstRun.getFontSizeAsDouble() != null && firstRun.getFontSizeAsDouble() > 0) {
|
|
|
+ style.put("fontSize", firstRun.getFontSizeAsDouble());
|
|
|
+ }
|
|
|
+ if (firstRun.isBold()) {
|
|
|
+ style.put("bold", true);
|
|
|
+ }
|
|
|
+ if (firstRun.isItalic()) {
|
|
|
+ style.put("italic", true);
|
|
|
+ }
|
|
|
+ if (firstRun.getUnderline() != UnderlinePatterns.NONE) {
|
|
|
+ style.put("underline", true);
|
|
|
+ }
|
|
|
+ if (firstRun.getColor() != null) {
|
|
|
+ style.put("color", firstRun.getColor());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.debug("提取段落样式失败: {}", e.getMessage());
|
|
|
+ }
|
|
|
+
|
|
|
+ return style.isEmpty() ? null : style;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 结构化提取结果
|
|
|
+ */
|
|
|
+ @Data
|
|
|
+ public static class WordStructuredResult {
|
|
|
+ private String documentId;
|
|
|
+ private List<ContentElement> elements;
|
|
|
+ private String fullText;
|
|
|
+ private int imageCount;
|
|
|
+ private int tableCount;
|
|
|
+ private int totalElements;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 内容元素
|
|
|
+ */
|
|
|
+ @Data
|
|
|
+ public static class ContentElement {
|
|
|
+ private int index; // 元素在文档中的顺序索引
|
|
|
+ private String type; // paragraph/heading/heading1-9/list_item/image/table/title/toc
|
|
|
+ private String content; // 文本内容(仅文本类型)
|
|
|
+ private Map<String, Object> style; // 样式信息
|
|
|
+
|
|
|
+ // 图片相关
|
|
|
+ private String imageUrl; // 图片访问 URL
|
|
|
+ private String imagePath; // 图片存储路径
|
|
|
+ private String imageAlt; // 图片描述
|
|
|
+ private Integer imageWidth; // 图片宽度(像素)
|
|
|
+ private Integer imageHeight; // 图片高度(像素)
|
|
|
+ private String imageFormat; // 图片格式
|
|
|
+
|
|
|
+ // 表格相关
|
|
|
+ private Integer tableIndex; // 表格索引
|
|
|
+ private List<List<TableCell>> tableRows; // 表格行数据
|
|
|
+ private Integer tableRowCount; // 行数
|
|
|
+ private Integer tableColCount; // 列数
|
|
|
+ private String tableText; // 表格文本(用于搜索)
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 表格单元格
|
|
|
+ */
|
|
|
+ @Data
|
|
|
+ public static class TableCell {
|
|
|
+ private int row;
|
|
|
+ private int col;
|
|
|
+ private String text;
|
|
|
+ private Integer colSpan; // 列合并数
|
|
|
+ private Integer rowSpan; // 行合并数
|
|
|
+ private boolean merged; // 是否为合并单元格
|
|
|
+ }
|
|
|
+}
|