[Bugfix][Rust Frontend] Fix UTF-8 char-boundary panic in incremental detokenizer (#44620)

Signed-off-by: Ting Sun <suntcrick@gmail.com>
This commit is contained in:
Ting SUN
2026-06-05 15:36:17 +08:00
committed by GitHub
parent ef3af56a97
commit ca73293fa6
+25
View File
@@ -115,6 +115,8 @@ impl<T: Tokenizer + ?Sized> IncrementalDecoder for DecodeStream<'_, T> {
fn next_chunk(&mut self) -> Option<String> {
let cutoff = self.cumulative_output.len().saturating_sub(self.min_bytes_to_buffer);
// Ensure we split at a utf-8 char boundary.
let cutoff = self.cumulative_output.floor_char_boundary(cutoff);
(cutoff > self.output_index).then(|| {
let chunk = self.cumulative_output[self.output_index..cutoff].to_string();
self.output_index = cutoff;
@@ -356,4 +358,27 @@ mod tests {
assert_eq!(last_chunk.as_deref(), Some("lo!"));
assert_eq!(full_text, "Hello!");
}
#[test]
fn next_chunk_cutoff_respects_char_boundary() {
    // Regression guard: `next_chunk` derives its cutoff as
    // `len - min_bytes_to_buffer`, and that offset has to be aligned to a
    // UTF-8 char boundary the same way push_token/flush do. Without the
    // alignment, streaming multi-byte text (CJK/emoji) while a hold-back
    // buffer is active (as set by a stop string) panics when
    // `cumulative_output` is sliced in the middle of a character.
    const TEXT: &str = "你好A";

    // Third argument (2) is the hold-back size, so the raw cutoff can land
    // inside one of the 3-byte CJK characters above.
    let mut stream = Utf8Backend.create_decode_stream(&[], false, 2);
    let mut streamed = String::new();

    // Feed the text one byte-valued token at a time, draining whatever the
    // decoder is willing to release after every push.
    for token in TEXT.bytes().map(u32::from) {
        stream.push_token(token).unwrap();
        streamed.extend(stream.next_chunk());
    }

    // The final flush hands back any held-back tail plus the full text.
    let (tail, complete) = stream.flush(None).unwrap();
    streamed.push_str(tail.as_deref().unwrap_or(""));

    assert_eq!(complete, "你好A");
    assert_eq!(streamed, "你好A");
}
}