---
# Disaggregated-serving example configuration (TensorRT-LLM).
# Source: mirror of https://github.com/NVIDIA/TensorRT-LLM.git
# Splits inference across dedicated context (prefill) and generation (decode)
# servers; the top-level settings apply to the orchestrating server.

hostname: localhost
port: 8000
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
# Fraction of GPU memory reserved for the KV cache at the top level.
free_gpu_memory_fraction: 0.25
backend: "pytorch"
# Overlap scheduling must be disabled for disaggregated serving.
disable_overlap_scheduler: true

context_servers:
  num_instances: 1
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  kv_cache_config:
    # Context servers use a smaller KV-cache reservation than the default.
    free_gpu_memory_fraction: 0.2
  cache_transceiver_config:
    # Transport backend for moving KV-cache blocks between servers.
    backend: "DEFAULT"
  urls:
    - "localhost:8001"

generation_servers:
  num_instances: 1
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  cache_transceiver_config:
    backend: "DEFAULT"
  urls:
    - "localhost:8002"