|
@@ -46,6 +46,16 @@ public class TextChunkService {
|
|
|
}
|
|
}
|
|
|
log.info("开始文本分块: documentId={}, textLength={}", documentId, text.length());
|
|
log.info("开始文本分块: documentId={}, textLength={}", documentId, text.length());
|
|
|
|
|
|
|
|
|
|
+ int safeChunkOverlap = chunkOverlap;
|
|
|
|
|
+ if (safeChunkOverlap < 0) {
|
|
|
|
|
+ log.warn("分块重叠小于 0,已修正为 0: overlap={}", chunkOverlap);
|
|
|
|
|
+ safeChunkOverlap = 0;
|
|
|
|
|
+ }
|
|
|
|
|
+ if (safeChunkOverlap >= chunkSize) {
|
|
|
|
|
+ log.warn("分块重叠过大,已修正为 size-1: overlap={}, size={}", chunkOverlap, chunkSize);
|
|
|
|
|
+ safeChunkOverlap = Math.max(0, chunkSize - 1);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
// 先删除已有的分块
|
|
// 先删除已有的分块
|
|
|
textChunkRepository.deleteByDocumentId(documentId);
|
|
textChunkRepository.deleteByDocumentId(documentId);
|
|
|
|
|
|
|
@@ -53,8 +63,16 @@ public class TextChunkService {
|
|
|
int start = 0;
|
|
int start = 0;
|
|
|
int chunkIndex = 0;
|
|
int chunkIndex = 0;
|
|
|
int lastReportedChunk = 0;
|
|
int lastReportedChunk = 0;
|
|
|
|
|
+ int prevStart = -1;
|
|
|
|
|
+ int maxChunks = Math.max(1, (text.length() / Math.max(1, (chunkSize - safeChunkOverlap))) + 2);
|
|
|
|
|
|
|
|
while (start < text.length()) {
|
|
while (start < text.length()) {
|
|
|
|
|
+ if (chunks.size() > maxChunks) {
|
|
|
|
|
+ log.warn("分块数量异常,提前终止: documentId={}, chunks={}, textLength={}, chunkSize={}, overlap={}",
|
|
|
|
|
+ documentId, chunks.size(), text.length(), chunkSize, chunkOverlap);
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
int end = Math.min(start + chunkSize, text.length());
|
|
int end = Math.min(start + chunkSize, text.length());
|
|
|
|
|
|
|
|
// 尝试在句子边界分割(避免截断句子)
|
|
// 尝试在句子边界分割(避免截断句子)
|
|
@@ -95,8 +113,19 @@ public class TextChunkService {
|
|
|
textChunkRepository.insert(chunk);
|
|
textChunkRepository.insert(chunk);
|
|
|
chunks.add(chunk);
|
|
chunks.add(chunk);
|
|
|
|
|
|
|
|
|
|
+ if (end >= text.length()) {
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
// 移动起始位置(考虑重叠)
|
|
// 移动起始位置(考虑重叠)
|
|
|
- start = end - chunkOverlap;
|
|
|
|
|
|
|
+ start = end - safeChunkOverlap;
|
|
|
|
|
+ if (start <= prevStart) {
|
|
|
|
|
+ log.warn("分块起点未推进,修正并终止: documentId={}, prevStart={}, start={}, end={}, overlap={}",
|
|
|
|
|
+ documentId, prevStart, start, end, chunkOverlap);
|
|
|
|
|
+ start = end;
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ prevStart = start;
|
|
|
if (start >= text.length() || start <= 0) {
|
|
if (start >= text.length() || start <= 0) {
|
|
|
break;
|
|
break;
|
|
|
}
|
|
}
|