mirror of https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[None][chore] Add placement test for ray executor (#9122)
Signed-off-by: Erin Ho <14718778+hchings@users.noreply.github.com>
parent bdcf837784
commit fe69243157
@@ -1,10 +1,14 @@
 import os
 
 import pytest
+import ray
+from ray.util.placement_group import (PlacementGroupSchedulingStrategy,
+                                      placement_group, remove_placement_group)
 from utils.llm_data import llm_models_root
 
 from tensorrt_llm import LLM
 from tensorrt_llm._torch.utils import get_device_uuid
+from tensorrt_llm.llmapi import KvCacheConfig
 
 
 class DummyWorkerExtension:
@@ -22,10 +26,62 @@ def test_worker_extension():
     assert result[0] == "SUCCESS"
 
 
+@pytest.mark.gpu4
+def test_bundle_indices(monkeypatch):
+    """Placement via bundle indices"""
+
+    monkeypatch.setenv("RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES", "1")
+    monkeypatch.setenv("TLLM_RAY_USE_RPC", "1")
+
+    pg = None
+    try:
+        ray.init()
+        pg = placement_group([{"GPU": 1, "CPU": 1}] * 4)
+        ray.get(pg.ready())
+        print(f"Placement group ready with bundles {pg.bundle_specs}")
+
+        bundle_indices = [2, 3]
+        runtime_env = {
+            "env_vars": {
+                "TRTLLM_RAY_PER_WORKER_GPUS": "0.8",
+                "TRTLLM_RAY_BUNDLE_INDICES": ",".join(map(str, bundle_indices))
+            }
+        }
+
+        llm = ray.remote(
+            num_cpus=0,
+            num_gpus=0,
+            runtime_env=runtime_env,
+            scheduling_strategy=PlacementGroupSchedulingStrategy(
+                placement_group=pg,
+                placement_group_capture_child_tasks=True,
+            ),
+        )(LLM).remote(
+            model=os.path.join(llm_models_root(), "llama-models-v2",
+                               "TinyLlama-1.1B-Chat-v1.0"),
+            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.1),
+            tensor_parallel_size=2,
+            orchestrator_type="ray",
+        )
+
+        inference_actor_uuids = ray.get(
+            llm._collective_rpc.remote("report_device_id"))
+
+        expected_uuids = [get_device_uuid(idx) for idx in bundle_indices]
+
+        assert sorted(inference_actor_uuids) == sorted(expected_uuids), \
+            f"Workers not placed on expected GPUs. Expected UUIDs: {expected_uuids}, Got: {inference_actor_uuids}"
+
+    finally:
+        if pg is not None:
+            remove_placement_group(pg)
+        ray.shutdown()
+
+
 @pytest.mark.gpu2
-def test_cuda_visible_device():
+def test_cuda_visible_device(monkeypatch):
     """Placement via cuda_visible_device"""
-    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "1")
 
     llm = LLM(model=llm_models_root() /
               "llama-models-v2/TinyLlama-1.1B-Chat-v1.0",
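The new test drives placement through TRTLLM_RAY_BUNDLE_INDICES, but the underlying mechanism is plain Ray placement-group scheduling. Below is a minimal sketch of that mechanism, independent of TensorRT-LLM and using the same imports as the test above; the GpuProbe actor and the two-bundle layout are illustrative assumptions, not part of this change:

import ray
from ray.util.placement_group import (PlacementGroupSchedulingStrategy,
                                      placement_group, remove_placement_group)


@ray.remote(num_gpus=1)
class GpuProbe:
    """Illustrative actor that reports which GPU Ray assigned to it."""

    def gpu_ids(self):
        return ray.get_gpu_ids()


ray.init()
# Reserve two bundles, each with one GPU and one CPU -- same shape as the
# test's placement_group([{"GPU": 1, "CPU": 1}] * 4), sized for two GPUs.
pg = placement_group([{"GPU": 1, "CPU": 1}] * 2)
ray.get(pg.ready())

# Pin the actor to bundle index 1; it may only use that bundle's GPU.
probe = GpuProbe.options(
    scheduling_strategy=PlacementGroupSchedulingStrategy(
        placement_group=pg,
        placement_group_bundle_index=1,
    )).remote()
print(ray.get(probe.gpu_ids.remote()))  # e.g. [1]

remove_placement_group(pg)
ray.shutdown()

The test asserts the same property end to end: with TRTLLM_RAY_BUNDLE_INDICES="2,3", the two tensor-parallel workers must report the device UUIDs of GPUs 2 and 3.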
@@ -33,6 +89,5 @@ def test_cuda_visible_device():
 
     infer_actor_uuids = llm._collective_rpc("report_device_id")
 
-    del os.environ["CUDA_VISIBLE_DEVICES"]
     assert infer_actor_uuids[0] == get_device_uuid(1)
     print(f"{infer_actor_uuids=}")
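The other change in test_cuda_visible_device swaps manual os.environ bookkeeping for pytest's monkeypatch fixture, which is why the del os.environ["CUDA_VISIBLE_DEVICES"] cleanup line is removed above. A toy sketch (a hypothetical test, not part of this commit) of why that is safer:

import os


def test_env_is_restored(monkeypatch):
    # monkeypatch records the prior state of the variable (set or unset)...
    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "1")
    assert os.environ["CUDA_VISIBLE_DEVICES"] == "1"
    # ...and restores it at teardown even if the assert above fails, so the
    # setting cannot leak into later tests the way a skipped
    # `del os.environ[...]` could.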