|
|
@@ -27,7 +27,13 @@ public class DocumentIndexService {
|
|
|
private final ObjectMapper objectMapper;
|
|
|
|
|
|
/**
|
|
|
- * 为纯文本生成行索引(Word/Excel等无分页文档)
|
|
|
+ * Page-break character (form feed, U+000C); text containing it is indexed page by page.
|
|
|
+ */
|
|
|
+ private static final char FORM_FEED = '\f';
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 为纯文本生成行索引(Word/Excel等文档)
|
|
|
+ * 如果文本中包含分页符(\f),则根据分页符生成页面索引
|
|
|
*
|
|
|
* @param text 文本内容
|
|
|
* @param documentId 文档ID
|
|
|
@@ -39,6 +45,118 @@ public class DocumentIndexService {
|
|
|
return createEmptyIndex(documentId);
|
|
|
}
|
|
|
|
|
|
+ // 检查是否包含分页符
|
|
|
+ boolean hasPageBreaks = text.indexOf(FORM_FEED) >= 0;
|
|
|
+
|
|
|
+ if (hasPageBreaks) {
|
|
|
+ log.info("检测到分页符,生成分页索引: documentId={}", documentId);
|
|
|
+ return generateIndexWithPageBreaks(text, documentId, indexOutputPath);
|
|
|
+ } else {
|
|
|
+ log.debug("无分页符,生成单页索引: documentId={}", documentId);
|
|
|
+ return generateSinglePageIndex(text, documentId, indexOutputPath);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 根据分页符生成多页索引
|
|
|
+ */
|
|
|
+ private DocumentIndex generateIndexWithPageBreaks(String text, String documentId, String indexOutputPath) {
|
|
|
+ List<PageIndex> pageIndices = new ArrayList<>();
|
|
|
+ List<LineIndex> lineIndices = new ArrayList<>();
|
|
|
+
|
|
|
+ int charPos = 0;
|
|
|
+ int lineNum = 1;
|
|
|
+ int pageNum = 1;
|
|
|
+ int pageCharStart = 0;
|
|
|
+ int pageLineStart = 1;
|
|
|
+
|
|
|
+ String[] lines = text.split("\n", -1);
|
|
|
+
|
|
|
+ for (String line : lines) {
|
|
|
+ // 检查这一行是否包含分页符
|
|
|
+ int ffIndex = line.indexOf(FORM_FEED);
|
|
|
+
|
|
|
+ if (ffIndex >= 0) {
|
|
|
+ // 处理分页符之前的内容
|
|
|
+ if (ffIndex > 0) {
|
|
|
+ LineIndex lineIndex = new LineIndex();
|
|
|
+ lineIndex.setLine(lineNum);
|
|
|
+ lineIndex.setCharStart(charPos);
|
|
|
+ lineIndex.setCharEnd(charPos + ffIndex);
|
|
|
+ lineIndices.add(lineIndex);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 结束当前页
|
|
|
+ PageIndex pageIndex = new PageIndex();
|
|
|
+ pageIndex.setPage(pageNum);
|
|
|
+ pageIndex.setCharStart(pageCharStart);
|
|
|
+ pageIndex.setCharEnd(charPos + ffIndex);
|
|
|
+ pageIndex.setLineStart(pageLineStart);
|
|
|
+ pageIndex.setLineEnd(lineNum);
|
|
|
+ pageIndices.add(pageIndex);
|
|
|
+
|
|
|
+ // 开始新页
|
|
|
+ pageNum++;
|
|
|
+ pageCharStart = charPos + ffIndex + 1; // +1 跳过分页符
|
|
|
+ pageLineStart = lineNum + 1;
|
|
|
+
|
|
|
+ // 处理分页符之后的内容(如果有)
|
|
|
+ if (ffIndex + 1 < line.length()) {
|
|
|
+ lineNum++;
|
|
|
+ LineIndex afterLineIndex = new LineIndex();
|
|
|
+ afterLineIndex.setLine(lineNum);
|
|
|
+ afterLineIndex.setCharStart(charPos + ffIndex + 1);
|
|
|
+ afterLineIndex.setCharEnd(charPos + line.length());
|
|
|
+ lineIndices.add(afterLineIndex);
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ // 普通行
|
|
|
+ LineIndex lineIndex = new LineIndex();
|
|
|
+ lineIndex.setLine(lineNum);
|
|
|
+ lineIndex.setCharStart(charPos);
|
|
|
+ lineIndex.setCharEnd(charPos + line.length());
|
|
|
+ lineIndices.add(lineIndex);
|
|
|
+ }
|
|
|
+
|
|
|
+ charPos += line.length() + 1; // +1 for \n
|
|
|
+ lineNum++;
|
|
|
+ }
|
|
|
+
|
|
|
+ // 添加最后一页
|
|
|
+ if (pageCharStart < text.length()) {
|
|
|
+ PageIndex lastPage = new PageIndex();
|
|
|
+ lastPage.setPage(pageNum);
|
|
|
+ lastPage.setCharStart(pageCharStart);
|
|
|
+ lastPage.setCharEnd(text.length());
|
|
|
+ lastPage.setLineStart(pageLineStart);
|
|
|
+ lastPage.setLineEnd(lineNum - 1);
|
|
|
+ pageIndices.add(lastPage);
|
|
|
+ }
|
|
|
+
|
|
|
+ // 创建文档索引
|
|
|
+ DocumentIndex index = new DocumentIndex();
|
|
|
+ index.setDocumentId(documentId);
|
|
|
+ index.setTotalChars(text.length());
|
|
|
+ index.setTotalLines(lineNum - 1);
|
|
|
+ index.setTotalPages(pageIndices.size());
|
|
|
+ index.setPages(pageIndices);
|
|
|
+ index.setLines(lineIndices);
|
|
|
+
|
|
|
+ log.info("分页索引生成完成: documentId={}, pages={}, lines={}",
|
|
|
+ documentId, pageIndices.size(), lineIndices.size());
|
|
|
+
|
|
|
+ // 保存索引文件
|
|
|
+ if (indexOutputPath != null) {
|
|
|
+ saveIndexFile(index, indexOutputPath);
|
|
|
+ }
|
|
|
+
|
|
|
+ return index;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 生成单页索引(无分页符的文档)
|
|
|
+ */
|
|
|
+ private DocumentIndex generateSinglePageIndex(String text, String documentId, String indexOutputPath) {
|
|
|
List<LineIndex> lineIndices = new ArrayList<>();
|
|
|
int charPos = 0;
|
|
|
int lineNum = 1;
|