From 505678a286f680855e123fc12a2fb65c384e2f5b Mon Sep 17 00:00:00 2001
From: bhsueh_NV <11360707+byshiue@users.noreply.github.com>
Date: Thu, 12 Jun 2025 14:40:57 +0800
Subject: [PATCH] Lower free_gpu_memory_fraction to 0.6 for H100 Qwen3 QA test
 (#5114)

Signed-off-by: root <root@eos0274.eos.clusters.nvidia.com>
Co-authored-by: root <root@eos0274.eos.clusters.nvidia.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 269c597120..b98f4ae712 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1373,7 +1373,7 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness):
         pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
                               use_cuda_graph=cuda_graph)
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         llm = LLM(
             f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
             tensor_parallel_size=tp_size,