mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 00:57:23 +08:00
* Add LiteLLM chat and embedding model providers. * Fix code review findings. * Add litellm. * Fix formatting. * Update dictionary. * Update litellm. * Fix embedding. * Remove manual use of tiktoken and replace with Tokenizer interface. Adds support for encoding and decoding the models supported by litellm. * Update litellm. * Configure litellm to drop unsupported params. * Cleanup semversioner release notes. * Add num_tokens util to Tokenizer interface. * Update litellm service factories. * Cleanup litellm chat/embedding model argument assignment. * Update chat and embedding type field for litellm use and future migration away from fnllm. * Flatten litellm service organization. * Update litellm. * Update litellm factory validation. * Flatten litellm rate limit service organization. * Update rate limiter - disable with None/null instead of 0. * Fix usage of get_tokenizer. * Update litellm service registrations. * Add jitter to exponential retry. * Update validation. * Update validation. * Add litellm request logging layer. * Update cache key. * Update defaults. --------- Co-authored-by: Alonso Guevara <alonsog@microsoft.com>
171 lines
4.8 KiB
Python
171 lines
4.8 KiB
Python
# Copyright (c) 2024 Microsoft Corporation.
|
|
# Licensed under the MIT License
|
|
|
|
from unittest import mock
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
import tiktoken
|
|
|
|
from graphrag.index.text_splitting.text_splitting import (
|
|
NoopTextSplitter,
|
|
TokenChunkerOptions,
|
|
TokenTextSplitter,
|
|
split_multiple_texts_on_tokens,
|
|
split_single_text_on_tokens,
|
|
)
|
|
|
|
|
|
def test_noop_text_splitter() -> None:
    """The noop splitter passes its input through unchanged."""
    splitter = NoopTextSplitter()

    single = list(splitter.split_text("some text"))
    multiple = list(splitter.split_text(["some", "text"]))

    assert single == ["some text"]
    assert multiple == ["some", "text"]
|
class MockTokenizer:
    """Minimal stand-in tokenizer: one token per character (its code point)."""

    def encode(self, text):
        """Map each character of *text* to its Unicode code point."""
        return list(map(ord, text))

    def decode(self, token_ids):
        """Reassemble a string from a sequence of code points."""
        return "".join(map(chr, token_ids))
|
|
def test_split_text_str_empty():
    """An empty string yields no chunks."""
    splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2)

    assert splitter.split_text("") == []
|
|
def test_split_text_str_bool():
    """None input yields no chunks rather than raising."""
    # NOTE(review): despite the "bool" in the name, this exercises None input.
    splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2)

    assert splitter.split_text(None) == []  # type: ignore
|
|
def test_split_text_str_int():
    """Non-string, non-list input such as an int raises TypeError."""
    splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2)

    with pytest.raises(TypeError):
        splitter.split_text(123)  # type: ignore
|
|
@mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens")
def test_split_text_large_input(mock_split):
    """Large inputs are delegated to the single-text token splitter exactly once."""
    chunk_count = 2_000
    mock_split.return_value = ["chunk"] * chunk_count
    splitter = TokenTextSplitter(chunk_size=5, chunk_overlap=2)

    result = splitter.split_text("a" * 10_000)

    mock_split.assert_called_once()
    assert len(result) == chunk_count, "Large input was not split correctly"
|
|
@mock.patch("graphrag.index.text_splitting.text_splitting.split_single_text_on_tokens")
@mock.patch("graphrag.index.text_splitting.text_splitting.TokenChunkerOptions")
def test_token_text_splitter(mock_tokenizer, mock_split_text):
    """A list input is joined with spaces and forwarded to the single-text splitter."""
    # Decorators apply bottom-up: mock_tokenizer patches TokenChunkerOptions,
    # mock_split_text patches split_single_text_on_tokens.
    tokenizer_instance = MagicMock()
    mock_tokenizer.return_value = tokenizer_instance
    mock_split_text.return_value = ["chunk1", "chunk2", "chunk3"]

    TokenTextSplitter().split_text(["chunk1", "chunk2", "chunk3"])

    mock_split_text.assert_called_once_with(
        text="chunk1 chunk2 chunk3", tokenizer=tokenizer_instance
    )
|
|
def test_split_single_text_on_tokens():
    """Character-tokenized text splits into 10-char chunks with a 5-char overlap."""
    text = "This is a test text, meaning to be taken seriously by this test only."
    char_tokenizer = MockTokenizer()
    options = TokenChunkerOptions(
        chunk_overlap=5,
        tokens_per_chunk=10,
        decode=char_tokenizer.decode,
        encode=char_tokenizer.encode,
    )

    expected = [
        "This is a ",
        "is a test ",
        "test text,",
        "text, mean",
        " meaning t",
        "ing to be ",
        "o be taken",
        "taken seri",  # cspell:disable-line
        " seriously",
        "ously by t",  # cspell:disable-line
        " by this t",
        "his test o",
        "est only.",
    ]

    assert split_single_text_on_tokens(text=text, tokenizer=options) == expected
|
|
def test_split_multiple_texts_on_tokens():
    """The progress callback fires while splitting a batch of texts."""
    texts = [
        "This is a test text, meaning to be taken seriously by this test only.",
        "This is th second text, meaning to be taken seriously by this test only.",
    ]
    char_tokenizer = MockTokenizer()
    progress = MagicMock()
    options = TokenChunkerOptions(
        chunk_overlap=5,
        tokens_per_chunk=10,
        decode=char_tokenizer.decode,
        encode=char_tokenizer.encode,
    )

    split_multiple_texts_on_tokens(texts, options, tick=progress)

    progress.assert_called()
|
|
def test_split_single_text_on_tokens_no_overlap():
    """Splitting with a real tiktoken encoding: 2-token chunks, 1-token overlap."""
    # NOTE(review): despite the name, this uses chunk_overlap=1, not 0.
    text = "This is a test text, meaning to be taken seriously by this test only."
    encoding = tiktoken.get_encoding("cl100k_base")

    def encode(value: str) -> list[int]:
        # Coerce non-string input to str before encoding, matching the
        # original test's defensive behavior.
        return encoding.encode(value if isinstance(value, str) else f"{value}")

    options = TokenChunkerOptions(
        chunk_overlap=1,
        tokens_per_chunk=2,
        decode=encoding.decode,
        encode=encode,
    )

    expected = [
        "This is",
        " is a",
        " a test",
        " test text",
        " text,",
        ", meaning",
        " meaning to",
        " to be",
        " be taken",  # cspell:disable-line
        " taken seriously",  # cspell:disable-line
        " seriously by",
        " by this",  # cspell:disable-line
        " this test",
        " test only",
        " only.",
    ]

    assert split_single_text_on_tokens(text=text, tokenizer=options) == expected