[None][chore] Merge residual+hidden into layer norm at the end of each NemotronH MTP, and remove a % operation (#11406)

Signed-off-by: Harris Nover <249353502+hnover-nv@users.noreply.github.com>
This commit is contained in:
Harris Nover 2026-02-11 10:01:36 -07:00 committed by GitHub
parent 7a103035be
commit 2d5ebb3fe8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -658,11 +658,10 @@ class NemotronHMTPDecoderLayer(NemotronHLayer):
)
if self.has_end_norm:
if residual is not None:
hidden_states = hidden_states + residual
residual = None
hidden_states = self.final_layernorm(hidden_states)
hidden_states, residual = self.final_layernorm(
hidden_states, residual)
# The last step, so don't forward the residual.
residual = None
return hidden_states, residual
@@ -690,9 +689,7 @@ class NemotronHMTP(nn.Module):
# Build pattern-based layers
self.layers = nn.ModuleDict()
for i in range(self.pattern_len):
step_rel_idx = i % self.pattern_len
for step_rel_idx in range(self.pattern_len):
char = self.pattern_str[step_rel_idx]
is_start_of_step = step_rel_idx == 0
@@ -710,7 +707,7 @@ class NemotronHMTP(nn.Module):
skip_create_weights_in_init,
)
self.layers[str(i)] = NemotronHMTPDecoderLayer(
self.layers[str(step_rel_idx)] = NemotronHMTPDecoderLayer(
model_config=sublayer_model_config,
layer_idx=self.layer_idx,
aux_stream_dict=aux_stream_dict,