Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-16 15:55:08 +08:00
[None][chore] Merge residual+hidden into layer norm at the end of each NemotronH MTP, and remove a % operation (#11406)
Signed-off-by: Harris Nover <249353502+hnover-nv@users.noreply.github.com>
This commit is contained in:
parent 7a103035be
commit 2d5ebb3fe8
@@ -658,11 +658,10 @@ class NemotronHMTPDecoderLayer(NemotronHLayer):
             )
 
         if self.has_end_norm:
-            if residual is not None:
-                hidden_states = hidden_states + residual
-                residual = None
-
-            hidden_states = self.final_layernorm(hidden_states)
+            hidden_states, residual = self.final_layernorm(
+                hidden_states, residual)
+            # The last step, so don't forward the residual.
+            residual = None
 
         return hidden_states, residual
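The new code relies on final_layernorm accepting an optional residual and fusing the residual add into the normalization, which the call signature in the hunk above confirms. Below is a minimal sketch of that convention, assuming an RMSNorm-style norm; the FusedAddRMSNorm name and its internals are illustrative, not TensorRT-LLM's actual class:

import torch
from torch import nn

class FusedAddRMSNorm(nn.Module):
    """Sketch of a norm that folds a pending residual into its input."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, hidden_states, residual=None):
        # Fold the residual in before normalizing, replacing the removed
        # `hidden_states = hidden_states + residual` in the caller.
        if residual is not None:
            hidden_states = hidden_states + residual
        # Return the pre-norm sum so callers can forward it as the next residual.
        residual = hidden_states
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
        return self.weight * hidden_states, residual

Because this is the final norm of the MTP step, the caller discards the returned residual (residual = None) instead of forwarding it, as the added lines above do.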
@@ -690,9 +689,7 @@ class NemotronHMTP(nn.Module):
         # Build pattern-based layers
         self.layers = nn.ModuleDict()
 
-        for i in range(self.pattern_len):
-            step_rel_idx = i % self.pattern_len
-
+        for step_rel_idx in range(self.pattern_len):
             char = self.pattern_str[step_rel_idx]
 
             is_start_of_step = step_rel_idx == 0
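The dropped modulo was a no-op: the loop index already satisfies 0 <= i < self.pattern_len, and i % n == i whenever 0 <= i < n, so renaming the loop variable to step_rel_idx preserves behavior exactly. A quick check with an illustrative pattern length (the real value comes from the MTP pattern string):

pattern_len = 4  # illustrative value only
assert all(i % pattern_len == i for i in range(pattern_len))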
@@ -710,7 +707,7 @@ class NemotronHMTP(nn.Module):
                 skip_create_weights_in_init,
             )
 
-            self.layers[str(i)] = NemotronHMTPDecoderLayer(
+            self.layers[str(step_rel_idx)] = NemotronHMTPDecoderLayer(
                 model_config=sublayer_model_config,
                 layer_idx=self.layer_idx,
                 aux_stream_dict=aux_stream_dict,
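Putting the two NemotronHMTP hunks together, the construction loop after this change reads roughly as follows; only lines visible in the hunks above are shown, with elided surrounding code marked by ellipsis comments:

for step_rel_idx in range(self.pattern_len):
    char = self.pattern_str[step_rel_idx]
    is_start_of_step = step_rel_idx == 0
    # ... build sublayer_model_config, including skip_create_weights_in_init ...
    self.layers[str(step_rel_idx)] = NemotronHMTPDecoderLayer(
        model_config=sublayer_model_config,
        layer_idx=self.layer_idx,
        aux_stream_dict=aux_stream_dict,
        # ... remaining arguments unchanged ...
    )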