From 19a3031ecb0b4ce96ef135a5c29870fef2f4fa71 Mon Sep 17 00:00:00 2001
From: Pamela Peng <179191831+pamelap-nvidia@users.noreply.github.com>
Date: Fri, 13 Feb 2026 15:29:41 -0500
Subject: [PATCH] [TRTLLM-10329][feat] Fix weight loading for Nemotron 3 models on DGX Spark (#11405)

Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com>
---
 .../hf/nemotron_h_weight_mapper.py            |  4 +++-
 .../_torch/modules/fused_moe/quantization.py  | 19 ++++++++++++++++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py
index 25c5764b94..9d37995e91 100644
--- a/tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py
@@ -80,7 +80,9 @@ class NemotronHHfWeightMapper(HfWeightMapper):
             elif "A" in key:
                 w = split(weights[name], tp_size, tp_rank)
                 w = w.to(torch.float32)
-                w = -torch.exp(w)
+                # Avoid extra temporaries: one fp32 cast, then in-place exp/neg.
+                w.exp_()
+                w.neg_()
                 new_weights[key] = w
             elif "D" in key:
                 w = split(weights[name], tp_size, tp_rank)
diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
index 85fb1332a5..8ed02847dc 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
@@ -23,7 +23,7 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 
-from tensorrt_llm._utils import get_sm_version, is_sm_100f
+from tensorrt_llm._utils import get_sm_version, is_device_integrated, is_sm_100f
 from tensorrt_llm.logger import logger
 from tensorrt_llm.quantization.functional import \
     preprocess_weights_for_mixed_gemm
@@ -38,6 +38,7 @@ from ...utils import (replace_parameter_and_save_metadata, swizzle_sf,
                       unswizzle_sf)
 from ..linear import TensorParallelMode, load_weight_shard
 from .interface import MoEWeightLoadingMode
+from .moe_load_balancer import advise_tensor_pageout
 
 # The declarations aligns with moe_kernels.h
 # pack inputs into int64, e.g. 4 x bf16 input values
@@ -306,6 +307,20 @@ class FusedMoEMethodBase(ABC):
             w3_w1_kargs["allow_partial_loading"] = allow_partial_loading
         if "allow_partial_loading" in w2_args:
             w2_kargs["allow_partial_loading"] = allow_partial_loading
+
+        def maybe_pageout_mmapped_cpu_weights(
+                weight_tensors: List[object]) -> None:
+            # Integrated GPU systems share physical memory with CPU. After we
+            # finish copying from mmapped CPU weights, proactively advising the
+            # kernel to drop those pages reduces shared-memory pressure.
+            if not is_device_integrated():
+                return
+            for weight in weight_tensors:
+                if (isinstance(weight, torch.Tensor)
+                        and weight.device.type == "cpu"
+                        and weight.is_contiguous()):
+                    advise_tensor_pageout(weight)
+
         # Multithread weight load is superseded by prefetch_files() in model_engine.py
         # Also, threading adds overhead in order to protect shuffle index cache with critical section.
         for local_slot_id, expert_id in enumerate(load_expert_ids):
@@ -361,6 +376,7 @@ class FusedMoEMethodBase(ABC):
                 if weight is not None
             ]
             module._add_raw_shared_weights_for_unmap(unmap_weights)
+            maybe_pageout_mmapped_cpu_weights(unmap_weights)
 
             if module.bias:
                 self.load_expert_w3_w1_weight(
@@ -375,6 +391,7 @@ class FusedMoEMethodBase(ABC):
                 if weight is not None
             ]
             module._add_raw_shared_weights_for_unmap(unmap_weights)
+            maybe_pageout_mmapped_cpu_weights(unmap_weights)
 
     def load_weights(self,
                      module: torch.nn.Module,
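
Note (illustrative, not part of the patch): the new helper relies on advise_tensor_pageout from moe_load_balancer to ask the kernel to drop file-backed pages after the copy. Below is a minimal sketch of how such page-out advice can be issued on Linux, assuming it reduces to an madvise(MADV_DONTNEED) call on the tensor's mmapped storage. The name pageout_cpu_tensor and the direct libc call are assumptions for illustration, not the library's actual implementation.

# Hypothetical sketch, not TensorRT-LLM code: drop the pages backing a
# contiguous, file-backed CPU tensor so shared memory can be reclaimed.
import ctypes
import ctypes.util
import mmap

import torch

# Linux MADV_DONTNEED: the kernel may discard the pages; for a file-backed
# mapping the data is simply re-read from the checkpoint on the next access.
MADV_DONTNEED = 4

_libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True)


def pageout_cpu_tensor(t: torch.Tensor) -> None:
    """Advise the kernel that a contiguous CPU tensor's pages can be released."""
    if t.device.type != "cpu" or not t.is_contiguous():
        return
    addr = t.data_ptr()
    length = t.numel() * t.element_size()
    page = mmap.PAGESIZE
    # madvise requires a page-aligned start; note the last partial page in the
    # range is also affected, so this is only safe for file-backed weights.
    aligned = addr - (addr % page)
    _libc.madvise(ctypes.c_void_p(aligned),
                  ctypes.c_size_t(length + (addr - aligned)),
                  MADV_DONTNEED)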