Fix tokenizer removal from chunker

This commit is contained in:
Nathan Evans 2025-12-22 11:20:43 -08:00
parent 90479c0b1c
commit b32f403e8f
3 changed files with 4 additions and 4 deletions

View File

@@ -38,8 +38,8 @@ def register_chunker(
 def create_chunker(
     config: ChunkingConfig,
-    encode: Callable[[str], list[int]] | None,
-    decode: Callable[[list[int]], str] | None,
+    encode: Callable[[str], list[int]] | None = None,
+    decode: Callable[[list[int]], str] | None = None,
 ) -> Chunker:
     """Create a chunker implementation based on the given configuration.
View File

@@ -62,7 +62,7 @@ async def load_docs_in_chunks(
         cache=NoopCache(),
     )
     tokenizer = get_tokenizer(embeddings_llm_settings)
-    chunker = create_chunker(config.chunks, tokenizer)
+    chunker = create_chunker(config.chunks, tokenizer.encode, tokenizer.decode)
     input_storage = create_storage(config.input.storage)
     input_reader = InputReaderFactory().create(
         config.input.file_type,

View File

@@ -66,7 +66,7 @@ class TestRunTokens:
             strategy=ChunkStrategyType.Tokens,
         )
-        chunker = create_chunker(config, tokenizer=tokenizer)
+        chunker = create_chunker(config, mock_encoder.encode, mock_encoder.decode)
         chunks = chunker.chunk(input)
        assert len(chunks) > 0