Latest commit: chore: Refactor to reduce duplicated code in the disagg server, reuse trtllm-serve
* Updating README, removing launch script
* Fixing integration tests
* Adding scripts to populate the urls section of the disagg config based on SLURM env vars (a sketch of this idea follows the config below)
Signed-off-by: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com>
hostname: localhost
port: 8000
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
free_gpu_memory_fraction: 0.25
backend: "pytorch"
pytorch_backend_config:
  use_cuda_graph: False
  enable_overlap_scheduler: False
context_servers:
  num_instances: 1
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  kv_cache_config:
    free_gpu_memory_fraction: 0.2
  urls:
    - "localhost:8001"
generation_servers:
  num_instances: 1
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  urls:
    - "localhost:8002"
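The last commit bullet above mentions scripts that fill in the urls sections of this config from SLURM environment variables. Below is a minimal sketch of that idea, not the repo's actual tooling: it assumes PyYAML is available and that the SLURM node list has already been expanded to one hostname per line (for example with scontrol show hostnames "$SLURM_JOB_NODELIST" > hosts.txt); the script name, file arguments, and port choice are hypothetical.

#!/usr/bin/env python3
# Hypothetical sketch: populate the `urls` sections of a disaggregated-serving
# config from the hosts allocated by SLURM. Not the repo's actual script.
import sys
import yaml  # PyYAML

def populate_urls(config_path: str, hosts_path: str, port: int = 8001) -> None:
    # Load the existing disagg config (e.g. disagg_config.yaml).
    with open(config_path) as f:
        config = yaml.safe_load(f)
    # One expanded hostname per line, e.g. from `scontrol show hostnames`.
    with open(hosts_path) as f:
        hosts = [line.strip() for line in f if line.strip()]

    num_ctx = config["context_servers"]["num_instances"]
    num_gen = config["generation_servers"]["num_instances"]
    if len(hosts) < num_ctx + num_gen:
        sys.exit(f"need {num_ctx + num_gen} hosts, got {len(hosts)}")

    # Assign one server per host here; real deployments may pack several
    # servers per node on different ports.
    config["context_servers"]["urls"] = [f"{h}:{port}" for h in hosts[:num_ctx]]
    config["generation_servers"]["urls"] = [
        f"{h}:{port}" for h in hosts[num_ctx:num_ctx + num_gen]
    ]

    # Write the populated config back in place, preserving key order.
    with open(config_path, "w") as f:
        yaml.safe_dump(config, f, sort_keys=False)

if __name__ == "__main__":
    populate_urls(sys.argv[1], sys.argv[2])

The populated config can then be passed to the disaggregated entry point, e.g. trtllm-serve disaggregated -c disagg_config.yaml (command form assumed from the TensorRT-LLM disaggregated examples); the context and generation workers listed under urls must already be serving on their respective ports.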