|
@@ -276,6 +276,7 @@ public class PdfTextExtractionService {
|
|
|
String indexOutputPath) {
|
|
String indexOutputPath) {
|
|
|
StringBuilder combinedText = new StringBuilder();
|
|
StringBuilder combinedText = new StringBuilder();
|
|
|
List<PageIndex> pageIndices = new ArrayList<>();
|
|
List<PageIndex> pageIndices = new ArrayList<>();
|
|
|
|
|
+ List<LineIndex> lineIndices = new ArrayList<>();
|
|
|
|
|
|
|
|
int currentCharPos = 0;
|
|
int currentCharPos = 0;
|
|
|
int currentLine = 1;
|
|
int currentLine = 1;
|
|
@@ -294,18 +295,35 @@ public class PdfTextExtractionService {
|
|
|
}
|
|
}
|
|
|
pageHeader += " ===\n";
|
|
pageHeader += " ===\n";
|
|
|
|
|
|
|
|
|
|
+ // Record the index entry for the page-header line (start/end offsets within the combined text).
|
|
|
|
|
+ LineIndex headerLineIndex = new LineIndex();
|
|
|
|
|
+ headerLineIndex.setLine(currentLine);
|
|
|
|
|
+ headerLineIndex.setCharStart(currentCharPos);
|
|
|
|
|
+ headerLineIndex.setCharEnd(currentCharPos + pageHeader.length() - 1); // -1 drops the single trailing '\n' (one char), giving the same exclusive end as the content lines (charStart + line.length())
|
|
|
|
|
+ lineIndices.add(headerLineIndex);
|
|
|
|
|
+
|
|
|
combinedText.append(pageHeader);
|
|
combinedText.append(pageHeader);
|
|
|
currentCharPos += pageHeader.length();
|
|
currentCharPos += pageHeader.length();
|
|
|
currentLine++;
|
|
currentLine++;
|
|
|
|
|
|
|
|
- // 添加页面内容
|
|
|
|
|
|
|
+ // 添加页面内容并记录每行索引
|
|
|
String pageText = result.getText();
|
|
String pageText = result.getText();
|
|
|
- combinedText.append(pageText).append("\n\n");
|
|
|
|
|
|
|
+ String[] lines = pageText.split("\n", -1);
|
|
|
|
|
+ for (String line : lines) {
|
|
|
|
|
+ LineIndex lineIndex = new LineIndex();
|
|
|
|
|
+ lineIndex.setLine(currentLine);
|
|
|
|
|
+ lineIndex.setCharStart(currentCharPos);
|
|
|
|
|
+ lineIndex.setCharEnd(currentCharPos + line.length());
|
|
|
|
|
+ lineIndices.add(lineIndex);
|
|
|
|
|
+
|
|
|
|
|
+ currentCharPos += line.length() + 1; // +1 for \n
|
|
|
|
|
+ currentLine++;
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- // 计算行数
|
|
|
|
|
- int pageLines = countLines(pageText);
|
|
|
|
|
- currentLine += pageLines + 1; // +1 for the empty line separator
|
|
|
|
|
- currentCharPos += pageText.length() + 2; // +2 for "\n\n"
|
|
|
|
|
|
|
+ // 添加额外的空行分隔符
|
|
|
|
|
+ combinedText.append(pageText).append("\n\n");
|
|
|
|
|
+ currentCharPos++; // 额外的 \n
|
|
|
|
|
+ currentLine++;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// 创建页面索引
|
|
// 创建页面索引
|
|
@@ -323,6 +341,7 @@ public class PdfTextExtractionService {
|
|
|
DocumentIndex documentIndex = new DocumentIndex();
|
|
DocumentIndex documentIndex = new DocumentIndex();
|
|
|
documentIndex.setDocumentId(documentId);
|
|
documentIndex.setDocumentId(documentId);
|
|
|
documentIndex.setPages(pageIndices);
|
|
documentIndex.setPages(pageIndices);
|
|
|
|
|
+ documentIndex.setLines(lineIndices);
|
|
|
documentIndex.setTotalChars(currentCharPos);
|
|
documentIndex.setTotalChars(currentCharPos);
|
|
|
documentIndex.setTotalLines(currentLine - 1);
|
|
documentIndex.setTotalLines(currentLine - 1);
|
|
|
documentIndex.setTotalPages(pageResults.size());
|
|
documentIndex.setTotalPages(pageResults.size());
|
|
@@ -338,22 +357,6 @@ public class PdfTextExtractionService {
|
|
|
return result;
|
|
return result;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- /**
|
|
|
|
|
- * Counts the lines in the text: returns 0 for null or empty input, otherwise 1 plus the number of '\n' characters.
|
|
|
|
|
- */
|
|
|
|
|
- private int countLines(String text) {
|
|
|
|
|
- if (text == null || text.isEmpty()) {
|
|
|
|
|
- return 0;
|
|
|
|
|
- }
|
|
|
|
|
- int lines = 1;
|
|
|
|
|
- for (char c : text.toCharArray()) {
|
|
|
|
|
- if (c == '\n') {
|
|
|
|
|
- lines++;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- return lines;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
/**
|
|
/**
|
|
|
* 保存索引文件
|
|
* 保存索引文件
|
|
|
*/
|
|
*/
|
|
@@ -413,11 +416,22 @@ public class PdfTextExtractionService {
|
|
|
public static class DocumentIndex {
|
|
public static class DocumentIndex {
|
|
|
private String documentId;
|
|
private String documentId;
|
|
|
private List<PageIndex> pages;
|
|
private List<PageIndex> pages;
|
|
|
|
|
+ private List<LineIndex> lines;
|
|
|
private int totalChars;
|
|
private int totalChars;
|
|
|
private int totalLines;
|
|
private int totalLines;
|
|
|
private int totalPages;
|
|
private int totalPages;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ /**
|
|
|
|
|
+ * Line index: the line number and character-offset range of one line of the combined text.
|
|
|
|
|
+ */
|
|
|
|
|
+ @Data
|
|
|
|
|
+ public static class LineIndex {
|
|
|
|
|
+ private int line; // line number; the code that fills this starts counting at 1
|
|
|
|
|
+ private int charStart; // character offset at which the line starts
|
|
|
|
|
+ private int charEnd; // character offset at which the line ends; NOTE(review): confirm inclusive vs. exclusive against the code that fills it
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
/**
|
|
/**
|
|
|
* 页面索引
|
|
* 页面索引
|
|
|
*/
|
|
*/
|