mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 00:57:23 +08:00
Fix tokenizer removal from chunker
This commit is contained in:
parent
90479c0b1c
commit
b32f403e8f
@@ -38,8 +38,8 @@ def register_chunker(
|
||||
|
||||
def create_chunker(
|
||||
config: ChunkingConfig,
|
||||
encode: Callable[[str], list[int]] | None,
|
||||
decode: Callable[[list[int]], str] | None,
|
||||
encode: Callable[[str], list[int]] | None = None,
|
||||
decode: Callable[[list[int]], str] | None = None,
|
||||
) -> Chunker:
|
||||
"""Create a chunker implementation based on the given configuration.
|
||||
|
||||
|
||||
@@ -62,7 +62,7 @@ async def load_docs_in_chunks(
|
||||
cache=NoopCache(),
|
||||
)
|
||||
tokenizer = get_tokenizer(embeddings_llm_settings)
|
||||
chunker = create_chunker(config.chunks, tokenizer)
|
||||
chunker = create_chunker(config.chunks, tokenizer.encode, tokenizer.decode)
|
||||
input_storage = create_storage(config.input.storage)
|
||||
input_reader = InputReaderFactory().create(
|
||||
config.input.file_type,
|
||||
|
||||
@@ -66,7 +66,7 @@ class TestRunTokens:
|
||||
strategy=ChunkStrategyType.Tokens,
|
||||
)
|
||||
|
||||
chunker = create_chunker(config, tokenizer=tokenizer)
|
||||
chunker = create_chunker(config, mock_encoder.encode, mock_encoder.decode)
|
||||
chunks = chunker.chunk(input)
|
||||
|
||||
assert len(chunks) > 0
|
||||
|
||||
Loading…
Reference in New Issue
Block a user