mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 09:07:20 +08:00
Fix tokenizer removal from chunker
This commit is contained in:
parent
90479c0b1c
commit
b32f403e8f
@ -38,8 +38,8 @@ def register_chunker(
|
|||||||
|
|
||||||
def create_chunker(
|
def create_chunker(
|
||||||
config: ChunkingConfig,
|
config: ChunkingConfig,
|
||||||
encode: Callable[[str], list[int]] | None,
|
encode: Callable[[str], list[int]] | None = None,
|
||||||
decode: Callable[[list[int]], str] | None,
|
decode: Callable[[list[int]], str] | None = None,
|
||||||
) -> Chunker:
|
) -> Chunker:
|
||||||
"""Create a chunker implementation based on the given configuration.
|
"""Create a chunker implementation based on the given configuration.
|
||||||
|
|
||||||
|
|||||||
@ -62,7 +62,7 @@ async def load_docs_in_chunks(
|
|||||||
cache=NoopCache(),
|
cache=NoopCache(),
|
||||||
)
|
)
|
||||||
tokenizer = get_tokenizer(embeddings_llm_settings)
|
tokenizer = get_tokenizer(embeddings_llm_settings)
|
||||||
chunker = create_chunker(config.chunks, tokenizer)
|
chunker = create_chunker(config.chunks, tokenizer.encode, tokenizer.decode)
|
||||||
input_storage = create_storage(config.input.storage)
|
input_storage = create_storage(config.input.storage)
|
||||||
input_reader = InputReaderFactory().create(
|
input_reader = InputReaderFactory().create(
|
||||||
config.input.file_type,
|
config.input.file_type,
|
||||||
|
|||||||
@ -66,7 +66,7 @@ class TestRunTokens:
|
|||||||
strategy=ChunkStrategyType.Tokens,
|
strategy=ChunkStrategyType.Tokens,
|
||||||
)
|
)
|
||||||
|
|
||||||
chunker = create_chunker(config, tokenizer=tokenizer)
|
chunker = create_chunker(config, mock_encoder.encode, mock_encoder.decode)
|
||||||
chunks = chunker.chunk(input)
|
chunks = chunker.chunk(input)
|
||||||
|
|
||||||
assert len(chunks) > 0
|
assert len(chunks) > 0
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user