[https://nvbugs/5670469][fix] Filter 0s and choose min of kv_head for Nemotron model (#10206)

Signed-off-by: Faraz Khoubsirat <58580514+farazkh80@users.noreply.github.com>
2026-01-13 22:18:36 +08:00 · 2026-01-04 19:42:53 -05:00 · 2026-01-04 19:42:53 -05:00 · 8e2065b4d9
commit 8e2065b4d9
parent e2f5455533
1 changed files with 6 additions and 1 deletions
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@ -1070,7 +1070,12 @@ class PyTorchModelEngine(ModelEngine):
        num_attention_heads = getattr(config, 'num_attention_heads', None)
        num_key_value_heads = getattr(config, 'num_key_value_heads', None)

-        if num_attention_heads is not None and num_key_value_heads is not None:
+        # Calculate the number of attention heads per KV head (GQA ratio)
+        if isinstance(num_key_value_heads, (list, tuple)):
+            # Filter out invalid KV heads, default to 0 if no valid KV heads are found
+            num_key_value_heads = min(
+                (kv for kv in num_key_value_heads if kv and kv > 0), default=0)
+        if num_attention_heads and num_key_value_heads:
            num_heads_per_kv = num_attention_heads // num_key_value_heads
        else:
            num_heads_per_kv = 1