graphrag/tests/unit/indexing/text_splitting/test_text_splitting.py
Derek Worthen 2b70e4a4f3
Tokenizer (#2051)
* Add LiteLLM chat and embedding model providers.

* Fix code review findings.

* Add litellm.

* Fix formatting.

* Update dictionary.

* Update litellm.

* Fix embedding.

* Remove manual use of tiktoken and replace with
Tokenizer interface. Adds support for encoding
and decoding the models supported by litellm.

* Update litellm.

* Configure litellm to drop unsupported params.

* Cleanup semversioner release notes.

* Add num_tokens util to Tokenizer interface.

* Update litellm service factories.

* Cleanup litellm chat/embedding model argument assignment.

* Update chat and embedding type field for litellm use and future migration away from fnllm.

* Flatten litellm service organization.

* Update litellm.

* Update litellm factory validation.

* Flatten litellm rate limit service organization.

* Update rate limiter - disable with None/null instead of 0.

* Fix usage of get_tokenizer.

* Update litellm service registrations.

* Add jitter to exponential retry.

* Update validation.

* Update validation.

* Add litellm request logging layer.

* Update cache key.

* Update defaults.

---------

Co-authored-by: Alonso Guevara <alonsog@microsoft.com>
2025-09-22 13:55:14 -06:00

171 lines
4.8 KiB
Python

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from unittest import mock
from unittest.mock import MagicMock
import pytest
import tiktoken
from graphrag.index.text_splitting.text_splitting import (
NoopTextSplitter,
TokenChunkerOptions,
TokenTextSplitter,
split_multiple_texts_on_tokens,
split_single_text_on_tokens,
)
def test_noop_text_splitter() -> None:
    """The noop splitter must yield its input back verbatim, string or list."""
    splitter = NoopTextSplitter()
    single = list(splitter.split_text("some text"))
    assert single == ["some text"]
    multiple = list(splitter.split_text(["some", "text"]))
    assert multiple == ["some", "text"]
class MockTokenizer:
    """Trivial character-level tokenizer: each character is one token.

    Tokens are the characters' Unicode code points, which makes expected
    chunk boundaries easy to reason about in the tests below.
    """

    def encode(self, text):
        # One token per character: its code point.
        return list(map(ord, text))

    def decode(self, token_ids):
        # Inverse of encode: code points back to a string.
        return "".join(map(chr, token_ids))
def test_split_text_str_empty():
    """An empty input string produces no chunks."""
    splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2)
    chunks = splitter.split_text("")
    assert chunks == []
def test_split_text_str_bool():
    """None input is tolerated and treated as empty.

    NOTE(review): despite the name, this exercises ``None``, not a bool.
    """
    splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2)
    chunks = splitter.split_text(None)  # type: ignore
    assert chunks == []
def test_split_text_str_int():
    """A non-string, non-list input (an int) raises TypeError."""
    splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2)
    with pytest.raises(TypeError):
        splitter.split_text(123)  # type: ignore
@mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens")
def test_split_text_large_input(mock_split):
    """A very large input is delegated to the token splitter exactly once."""
    huge_input = "a" * 10_000
    mock_split.return_value = ["chunk"] * 2_000
    splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2)
    chunks = splitter.split_text(huge_input)
    assert len(chunks) == 2_000, "Large input was not split correctly"
    mock_split.assert_called_once()
@mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens")
@mock.patch("graphrag.index.text_splitting.text_splitting.TokenChunkerOptions")
def test_token_text_splitter(mock_tokenizer, mock_split_text):
    """A list input is joined with spaces before being split on tokens."""
    # Decorators apply bottom-up, so the TokenChunkerOptions patch is the
    # first injected argument and the splitter patch the second.
    fake_options = MagicMock()
    mock_tokenizer.return_value = fake_options
    mock_split_text.return_value = ["chunk1", "chunk2", "chunk3"]
    TokenTextSplitter().split_text(["chunk1", "chunk2", "chunk3"])
    mock_split_text.assert_called_once_with(
        text="chunk1 chunk2 chunk3", tokenizer=fake_options
    )
def test_split_single_text_on_tokens():
    """Chunks of 10 character-tokens sliding forward by 5 (overlap of 5)."""
    source = "This is a test text, meaning to be taken seriously by this test only."
    char_tokenizer = MockTokenizer()
    options = TokenChunkerOptions(
        chunk_overlap=5,
        tokens_per_chunk=10,
        decode=char_tokenizer.decode,
        encode=char_tokenizer.encode,
    )
    # Every chunk is 10 characters (the last may be shorter) and each
    # neighbour repeats the previous chunk's final 5 characters.
    expected_splits = [
        "This is a ",
        "is a test ",
        "test text,",
        "text, mean",
        " meaning t",
        "ing to be ",
        "o be taken",
        "taken seri",  # cspell:disable-line
        " seriously",
        "ously by t",  # cspell:disable-line
        " by this t",
        "his test o",
        "est only.",
    ]
    assert split_single_text_on_tokens(text=source, tokenizer=options) == expected_splits
def test_split_multiple_texts_on_tokens():
    """The progress callback (tick) fires while splitting multiple texts."""
    inputs = [
        "This is a test text, meaning to be taken seriously by this test only.",
        "This is th second text, meaning to be taken seriously by this test only.",
    ]
    char_tokenizer = MockTokenizer()
    tick_spy = MagicMock()
    options = TokenChunkerOptions(
        chunk_overlap=5,
        tokens_per_chunk=10,
        decode=char_tokenizer.decode,
        encode=char_tokenizer.encode,
    )
    split_multiple_texts_on_tokens(inputs, options, tick=tick_spy)
    tick_spy.assert_called()
def test_split_single_text_on_tokens_no_overlap():
    """Real BPE tokens: 2-token chunks sliding forward by one token.

    NOTE(review): despite the name, this configures an overlap of 1 — each
    chunk shares its first token with the previous chunk's last token.
    """
    source = "This is a test text, meaning to be taken seriously by this test only."
    encoding = tiktoken.get_encoding("cl100k_base")

    def coercing_encode(value) -> list[int]:
        # Non-string inputs are stringified before encoding, matching the
        # splitter's tolerance for odd inputs.
        return encoding.encode(value if isinstance(value, str) else f"{value}")

    options = TokenChunkerOptions(
        chunk_overlap=1,
        tokens_per_chunk=2,
        decode=lambda tokens: encoding.decode(tokens),
        encode=coercing_encode,
    )
    expected_splits = [
        "This is",
        " is a",
        " a test",
        " test text",
        " text,",
        ", meaning",
        " meaning to",
        " to be",
        " be taken",  # cspell:disable-line
        " taken seriously",  # cspell:disable-line
        " seriously by",
        " by this",  # cspell:disable-line
        " this test",
        " test only",
        " only.",
    ]
    assert split_single_text_on_tokens(text=source, tokenizer=options) == expected_splits