Skip to content

Commit ec6fa58

Browse files
committed
fix(chunkers): fall back to character-level overlap in sentence chunker
When no complete sentence fits within the overlap budget, fall back to a character-level, word-boundary overlap taken from the end of the previous group's text. This ensures that the buildChunks metadata is always correct.
1 parent a53f760 commit ec6fa58

1 file changed

Lines changed: 12 additions & 2 deletions

File tree

apps/sim/lib/chunkers/sentence-chunker.ts

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,20 @@ export class SentenceChunker {
119119
overlapLen += prevGroup[j].length
120120
}
121121

122+
const currentText = groups[i].join(' ')
122123
if (overlapSentences.length > 0) {
123-
result.push(`${overlapSentences.join(' ')} ${groups[i].join(' ')}`)
124+
result.push(`${overlapSentences.join(' ')} ${currentText}`)
124125
} else {
125-
result.push(groups[i].join(' '))
126+
// No complete sentence fits — fall back to character-level overlap
127+
const prevText = prevGroup.join(' ')
128+
const tail = prevText.slice(-overlapChars)
129+
const wordMatch = tail.match(/^\s*\S/)
130+
const cleanTail = wordMatch ? tail.slice(tail.indexOf(wordMatch[0].trim())) : tail
131+
if (cleanTail.trim()) {
132+
result.push(`${cleanTail.trim()} ${currentText}`)
133+
} else {
134+
result.push(currentText)
135+
}
126136
}
127137
}
128138

0 commit comments

Comments
 (0)