|
|
@@ -103,7 +103,7 @@ public class WordStructuredExtractionService {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- // 提取段落文本
|
|
|
+ // 提取段落文本及格式
|
|
|
String text = paragraph.getText();
|
|
|
if (text != null && !text.trim().isEmpty()) {
|
|
|
ContentElement textElement = new ContentElement();
|
|
|
@@ -111,6 +111,8 @@ public class WordStructuredExtractionService {
|
|
|
textElement.setType(detectParagraphType(paragraph, text));
|
|
|
textElement.setContent(text.trim());
|
|
|
textElement.setStyle(extractParagraphStyle(paragraph));
|
|
|
+ // 逐 Run 提取格式
|
|
|
+ textElement.setRuns(extractTextRuns(paragraph.getRuns()));
|
|
|
|
|
|
elements.add(textElement);
|
|
|
fullText.append(text).append("\n");
|
|
|
@@ -219,6 +221,8 @@ public class WordStructuredExtractionService {
|
|
|
cellData.setRow(i);
|
|
|
cellData.setCol(j);
|
|
|
cellData.setText(cell.getText());
|
|
|
+ // 提取单元格内的文本格式
|
|
|
+ cellData.setRuns(extractCellRuns(cell));
|
|
|
|
|
|
// 提取单元格样式
|
|
|
try {
|
|
|
@@ -360,6 +364,144 @@ public class WordStructuredExtractionService {
|
|
|
return style.isEmpty() ? null : style;
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * 提取段落中所有 Run 的文本和样式
|
|
|
+ */
|
|
|
+ private List<TextRun> extractTextRuns(List<XWPFRun> xwpfRuns) {
|
|
|
+ if (xwpfRuns == null || xwpfRuns.isEmpty()) {
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+
|
|
|
+ List<TextRun> runs = new ArrayList<>();
|
|
|
+ for (XWPFRun xwpfRun : xwpfRuns) {
|
|
|
+ String text = xwpfRun.text();
|
|
|
+ if (text == null || text.isEmpty()) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ TextRun run = new TextRun();
|
|
|
+ run.setText(text);
|
|
|
+
|
|
|
+ try {
|
|
|
+ // 字体
|
|
|
+ if (xwpfRun.getFontFamily() != null) {
|
|
|
+ run.setFontFamily(xwpfRun.getFontFamily());
|
|
|
+ }
|
|
|
+
|
|
|
+ // 字号
|
|
|
+ Double fontSize = xwpfRun.getFontSizeAsDouble();
|
|
|
+ if (fontSize != null && fontSize > 0) {
|
|
|
+ run.setFontSize(fontSize);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 加粗
|
|
|
+ if (xwpfRun.isBold()) {
|
|
|
+ run.setBold(true);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 斜体
|
|
|
+ if (xwpfRun.isItalic()) {
|
|
|
+ run.setItalic(true);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 下划线
|
|
|
+ UnderlinePatterns underline = xwpfRun.getUnderline();
|
|
|
+ if (underline != null && underline != UnderlinePatterns.NONE) {
|
|
|
+ run.setUnderline(underline.name().toLowerCase());
|
|
|
+ }
|
|
|
+
|
|
|
+ // 颜色
|
|
|
+ String color = xwpfRun.getColor();
|
|
|
+ if (color != null && !color.isEmpty()) {
|
|
|
+ run.setColor(color);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 删除线
|
|
|
+ if (xwpfRun.isStrikeThrough() || xwpfRun.isDoubleStrikeThrough()) {
|
|
|
+ run.setStrikeThrough(true);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 上下标
|
|
|
+ VerticalAlign vertAlign = xwpfRun.getSubscript();
|
|
|
+ if (vertAlign != null && vertAlign != VerticalAlign.BASELINE) {
|
|
|
+ run.setVerticalAlign(vertAlign.name().toLowerCase());
|
|
|
+ }
|
|
|
+
|
|
|
+ // 高亮颜色
|
|
|
+ if (xwpfRun.isHighlighted()) {
|
|
|
+ try {
|
|
|
+ String highlightColor = xwpfRun.getTextHighlightColor().name().toLowerCase();
|
|
|
+ run.setHighlightColor(highlightColor);
|
|
|
+ } catch (Exception e) {
|
|
|
+ run.setHighlightColor("yellow");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } catch (Exception e) {
|
|
|
+ log.debug("提取 Run 样式失败: {}", e.getMessage());
|
|
|
+ }
|
|
|
+
|
|
|
+ runs.add(run);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 合并相邻的相同样式 Run(优化)
|
|
|
+ return mergeAdjacentRuns(runs);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 合并相邻的相同样式 Run
|
|
|
+ */
|
|
|
+ private List<TextRun> mergeAdjacentRuns(List<TextRun> runs) {
|
|
|
+ if (runs == null || runs.size() <= 1) {
|
|
|
+ return runs;
|
|
|
+ }
|
|
|
+
|
|
|
+ List<TextRun> merged = new ArrayList<>();
|
|
|
+ TextRun current = runs.get(0);
|
|
|
+
|
|
|
+ for (int i = 1; i < runs.size(); i++) {
|
|
|
+ TextRun next = runs.get(i);
|
|
|
+ if (isSameStyle(current, next)) {
|
|
|
+ // 合并文本
|
|
|
+ current.setText(current.getText() + next.getText());
|
|
|
+ } else {
|
|
|
+ merged.add(current);
|
|
|
+ current = next;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ merged.add(current);
|
|
|
+
|
|
|
+ return merged;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 判断两个 Run 样式是否相同
|
|
|
+ */
|
|
|
+ private boolean isSameStyle(TextRun r1, TextRun r2) {
|
|
|
+ return Objects.equals(r1.getFontFamily(), r2.getFontFamily())
|
|
|
+ && Objects.equals(r1.getFontSize(), r2.getFontSize())
|
|
|
+ && Objects.equals(r1.getBold(), r2.getBold())
|
|
|
+ && Objects.equals(r1.getItalic(), r2.getItalic())
|
|
|
+ && Objects.equals(r1.getUnderline(), r2.getUnderline())
|
|
|
+ && Objects.equals(r1.getColor(), r2.getColor())
|
|
|
+ && Objects.equals(r1.getStrikeThrough(), r2.getStrikeThrough())
|
|
|
+ && Objects.equals(r1.getVerticalAlign(), r2.getVerticalAlign())
|
|
|
+ && Objects.equals(r1.getHighlightColor(), r2.getHighlightColor());
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 从单元格中提取 Run
|
|
|
+ */
|
|
|
+ private List<TextRun> extractCellRuns(XWPFTableCell cell) {
|
|
|
+ List<TextRun> allRuns = new ArrayList<>();
|
|
|
+ for (XWPFParagraph para : cell.getParagraphs()) {
|
|
|
+ List<TextRun> runs = extractTextRuns(para.getRuns());
|
|
|
+ if (runs != null) {
|
|
|
+ allRuns.addAll(runs);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return allRuns.isEmpty() ? null : allRuns;
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* 结构化提取结果
|
|
|
*/
|
|
|
@@ -381,7 +523,8 @@ public class WordStructuredExtractionService {
|
|
|
private int index; // 元素在文档中的顺序索引
|
|
|
private String type; // paragraph/heading/heading1-9/list_item/image/table/title/toc
|
|
|
private String content; // 文本内容(仅文本类型)
|
|
|
- private Map<String, Object> style; // 样式信息
|
|
|
+ private Map<String, Object> style; // 段落级样式信息
|
|
|
+ private List<TextRun> runs; // 文本片段列表(保留格式)
|
|
|
|
|
|
// 图片相关
|
|
|
private String imageUrl; // 图片访问 URL
|
|
|
@@ -399,6 +542,23 @@ public class WordStructuredExtractionService {
|
|
|
private String tableText; // 表格文本(用于搜索)
|
|
|
}
|
|
|
|
|
|
+ /**
|
|
|
+ * A text fragment (run) that keeps its character-level formatting. Optional attributes are left null by the run extractor in this class when they are absent or at their default.
|
|
|
+ */
|
|
|
+ @Data
|
|
|
+ public static class TextRun {
|
|
|
+ private String text; // Text content of the run
|
|
|
+ private String fontFamily; // Font family name
|
|
|
+ private Double fontSize; // Font size (points)
|
|
|
+ private Boolean bold; // Bold; the run extractor only ever sets this to true, otherwise it stays null
|
|
|
+ private Boolean italic; // Italic; the run extractor only ever sets this to true, otherwise it stays null
|
|
|
+ private String underline; // Underline style: lower-cased UnderlinePatterns constant name; stays null when there is no underline (NONE is never stored)
|
|
|
+ private String color; // Font color (hexadecimal string)
|
|
|
+ private Boolean strikeThrough; // Strikethrough (single or double); set to true or left null
|
|
|
+ private String verticalAlign; // Vertical alignment: lower-cased VerticalAlign constant name (e.g. superscript/subscript); stays null for baseline
|
|
|
+ private String highlightColor; // Highlight color name
|
|
|
+ }
|
|
|
+
|
|
|
/**
|
|
|
* 表格单元格
|
|
|
*/
|
|
|
@@ -407,6 +567,7 @@ public class WordStructuredExtractionService {
|
|
|
private int row;
|
|
|
private int col;
|
|
|
private String text;
|
|
|
+ private List<TextRun> runs; // 单元格内的文本片段
|
|
|
private Integer colSpan; // 列合并数
|
|
|
private Integer rowSpan; // 行合并数
|
|
|
private boolean merged; // 是否为合并单元格
|