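# Overview (comment added for clarity): this module implements the graph
# simplification step of TensorRT-LLM's auto-parallel planner. Simplifier
# groups the layers of a TensorRT network into BuildingBlocks, detects repeated
# blocks by hashing their layers and edge structure, and emits a reduced
# PipelineGraph (plus GraphMapping/GraphConfig metadata) in which only one
# representative of each repeated block is kept, so that per-block decisions
# can be mapped back onto the full network via the recorded mappings.
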
import math
import re
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Tuple

import numpy as np

from tensorrt_llm.network import Network

from .config import AutoParallelConfig
from .device_mesh import PhysicalDeviceMesh
from .pipeline_graph import PipelineGraph
from .shape_info import ShapeInfo, ShapeType, get_shape_info
from .tensor_parallel.p2p_node import P2PType
from .utils import get_cache_key, get_sorted_layer_ids, silent_trt_logger


class StageType(Enum):
    START = 0
    BLOCK = 1
    END = 2


class BuildingBlock:

    def __init__(self, graph, layer_range) -> None:
        self.graph = graph
        self.layer_range = layer_range
        self.network = graph.as_trt()
        self.owned_inputs = {}
        self.is_edges_collected = False
        self.intra_edges = []
        self.src_inter_edges = []
        self.dst_inter_edges = []
        self.relative_src_inter_edges = []
        self.relative_dst_inter_edges = []
        self.relative_inter_edges = set()
        self.edge_hash = None
        self.outputs = None
        self.type_id = -1
        self.block_id = -1
        self.p2p_type = None
        self.is_superset = False
        self.is_subset = False
        self.sorted_layer_ids = []

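    # Comment added for clarity: collect_edges() stores edges as 4-tuples of
    # (source, source output index, destination, destination input index).
    # intra_edges keep both endpoints relative to this block (a source of -1
    # denotes a graph input owned by the block); dst_inter_edges keep the
    # source absolute (a layer index, or -1 plus the graph input index) and
    # the destination block-relative; src_inter_edges keep the source
    # block-relative and the destination absolute.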
    def collect_edges(self):
        if self.is_edges_collected:
            return
        for layer_index in self.layer_range:
            trt_layer = self.network.get_layer(layer_index)
            layer = self.graph.get_layer(trt_layer.name)
            layer_offset = layer.index - self.layer_range.start
            for input_index, input in enumerate(layer.inputs):
                if input is not None:
                    if input.is_graph_input:
                        is_owned = input.graph_input_index in self.owned_inputs
                        if not is_owned and np.all([
                                layer.index in self.layer_range or np.all([
                                    output.as_trt().is_shape_tensor
                                    for output in layer.outputs
                                ]) for layer, _ in input.consumers
                        ]):
                            self.owned_inputs[input.graph_input_index] = len(
                                self.owned_inputs)
                            is_owned = True
                        if is_owned:
                            self.intra_edges.append(
                                (-1, self.owned_inputs[input.graph_input_index],
                                 layer_offset, input_index))
                        else:
                            self.dst_inter_edges.append(
                                (-1, input.graph_input_index, layer_offset,
                                 input_index))
                    else:
                        src_layer_index = input.producer.index
                        if src_layer_index < self.layer_range.start or src_layer_index >= self.layer_range.stop:
                            self.dst_inter_edges.append(
                                (src_layer_index, input.output_index,
                                 layer_offset, input_index))
                        else:
                            src_layer_offset = src_layer_index - self.layer_range.start
                            self.intra_edges.append(
                                (src_layer_offset, input.output_index,
                                 layer_offset, input_index))
            for output_index, output in enumerate(layer.outputs):
                for dst_layer, dst_input_index in output.consumers:
                    dst_layer_index = dst_layer.index
                    if dst_layer_index < self.layer_range.start or dst_layer_index >= self.layer_range.stop:
                        self.src_inter_edges.append(
                            (layer_offset, output_index, dst_layer_index,
                             dst_input_index))
        self.edge_hash = tuple(self.intra_edges)
        self.outputs = sorted(
            set((edge[0], edge[1]) for edge in self.src_inter_edges))
        self.is_edges_collected = True

    def collect_relative_inter_edges(self, layer_to_block):
        self.collect_edges()
        for src_layer_index, src_output_index, dst_layer_index, dst_input_index in self.dst_inter_edges:
            if src_layer_index in layer_to_block:
                src_block = layer_to_block[src_layer_index]
                src_layer_offset = src_layer_index - src_block.layer_range.start
                dst = (self.type_id, dst_layer_index, dst_input_index)
                self.relative_dst_inter_edges.append(
                    (src_block.type_id, src_layer_offset, src_output_index,
                     *dst))
            else:
                self.relative_dst_inter_edges.append(
                    (-1, src_layer_index, src_output_index, self.type_id,
                     dst_layer_index, dst_input_index))
        self.relative_inter_edges = set(self.relative_dst_inter_edges +
                                        self.outputs)

    def get_input_names(self):
        self.collect_edges()
        input_tensor_names = []
        for edge in self.dst_inter_edges:
            layer_index = edge[0]
            output_index = edge[1]
            if layer_index == -1:
                tensor_name = self.network.get_input(output_index).name
            else:
                tensor_name = self.network.get_layer(layer_index).get_output(
                    output_index).name
            input_tensor_names.append(tensor_name)
        return input_tensor_names

    def get_input_mapping(self, last_blocks):
        input_mapping = {}
        for tensor_name, relative_edge in zip(self.get_input_names(),
                                              self.relative_dst_inter_edges):
            type_id = relative_edge[0]
            output_index = relative_edge[2]
            if type_id >= 0:
                last_block = last_blocks[type_id]
                layer_offset = relative_edge[1]
                mapped_layer_index = last_block.layer_range.start + layer_offset
                mapped_tensor_name = self.network.get_layer(
                    mapped_layer_index).get_output(output_index).name
                input_mapping[tensor_name] = mapped_tensor_name
            else:
                input_mapping[tensor_name] = tensor_name
        return input_mapping


@dataclass
class GraphMapping:
    layer_mapping: Dict[int, int] = None
    block_mapping: Dict[int, int] = None
    p2p_types: Dict[int, P2PType] = None
    p2p_tensors: Dict[int, List[str]] = None
    block_to_stage: Dict[int, int] = None
    same_spec_layer_mapping: Dict[str, str] = None


@dataclass
class GraphConfig:
    num_micro_batches: int = 1
    num_blocks: int = 1
    num_stages: int = 1
    has_cross_device: bool = False
    has_cross_host: bool = False
    graph_mapping: GraphMapping = None
    phy_mesh: PhysicalDeviceMesh = None
    stage_phy_meshes: List[PhysicalDeviceMesh] = None


class Simplifier:

    def __init__(self, network: Network, config: AutoParallelConfig):
        self.config = config
        self.sharded_io_allowlist = config.sharded_io_allowlist
        self.same_buffer_io = config.same_buffer_io
        self.same_spec_io = config.same_spec_io.copy()
        for key, value in self.same_buffer_io.items():
            if key not in self.same_spec_io:
                self.same_spec_io[key] = value

        self.llm_network = network
        self.network = network.trt_network
        self.module_to_layer_range_map = network._module_call_stack.module_to_layer_range_map
        self.graph = self.get_graph()
        self.init_layer_hash()

        module_tree = self.get_module_tree()
        building_blocks = self.collect_building_blocks(module_tree)
        blocks_by_module_hash = self.get_blocks_by_module_hash(building_blocks)
        self.blocks_by_edge_hash = self.get_blocks_by_edge_hash(
            blocks_by_module_hash)
        self.layer_to_block = self.get_layer_to_block()
        self.blocks = self.get_all_blocks()
        self.backbone_blocks = self.get_backbone_blocks()
        self.graph_mapping_for_shape = self.get_graph_mapping_for_shape()
        self.graph_for_shape = self.create_simplified_graph_for_shape()
        self.shape_info = None
        self.num_micro_batches = None

    def infer_shapes(self, num_micro_batches):
        if self.num_micro_batches == num_micro_batches:
            return
        with silent_trt_logger():
            self.shape_info = self.get_full_shape_info(num_micro_batches)
            self.graph.assign_shapes(self.shape_info)
            self.num_micro_batches = num_micro_batches

    def list_all_num_micro_batches(self):
        opt_batch_size = self.get_opt_batch_size()
        candidates = []
        for num_micro_batches in range(1, self.get_opt_batch_size() + 1):
            if opt_batch_size % num_micro_batches == 0:
                candidates.append(num_micro_batches)
        return candidates

    def get_graph(self):
        graph = PipelineGraph.from_trt(self.network)
        graph._unfilled_weights = self.llm_network._unfilled_weights.copy()
        graph._io_buffer_mapping
        for input in graph.inputs:
            input_name = input.name
            for pattern, repl in self.same_buffer_io.items():
                if re.match(pattern, input_name):
                    output_name = re.sub(pattern, repl, input_name)
                    output = graph.get_output(output_name)
                    if output is not None:
                        graph._io_buffer_mapping[output_name] = input_name
        return graph

    def get_opt_batch_size(self):
        input_tensors = self.llm_network._inputs
        num_profiles = len(list(input_tensors.values())[0].profiles)
        opt_batch_sizes = []
        for i in range(num_profiles):
            for input_tensor in input_tensors.values():
                shape_profile = input_tensor.profiles[i]
                opt_shape = shape_profile.opt
                for j in range(len(input_tensor.shape)):
                    name = input_tensor.trt_tensor.get_dimension_name(j)
                    if name == 'batch_size':
                        opt_batch_sizes.append(opt_shape[j])
        return min(opt_batch_sizes)

    def get_module_hash(self, layer_range):
        module_hash = ()
        for i in layer_range:
            assert i < self.network.num_layers, f"layer index {i} in {layer_range} out of range of {self.network.num_layers}"
            layer_name = self.network.get_layer(i).name
            layer = self.graph.get_layer(layer_name)
            module_hash += (layer.attrs["hash"], )
        return module_hash

    def get_network_hash(self) -> str:
        return str(self.get_module_hash(range(self.network.num_layers)))

    def collect_building_blocks(self, module_tree):
        building_blocks = {}
        queue = []
        for tree in module_tree["children"].values():
            queue.append(tree)
        while len(queue) > 0:
            while len(queue) > 0:
                tree = queue.pop(0)
                module_name = tree["name"]
                if module_name is None:
                    for child in tree["children"].values():
                        queue.append(child)
                    continue
                layer_range = self.module_to_layer_range_map[module_name]
                module_hash = self.get_module_hash(layer_range)
                if module_hash in building_blocks:
                    building_blocks[module_hash].append(tree)
                else:
                    building_blocks[module_hash] = [tree]
            for module_hash in [*building_blocks.keys()]:
                if len(building_blocks[module_hash]) == 1:
                    tree = building_blocks[module_hash][0]
                    for child in tree["children"].values():
                        queue.append(child)
                    del building_blocks[module_hash]
        blocks_by_module_hash = {
            module_hash: [
                BuildingBlock(self.graph,
                              self.module_to_layer_range_map[tree["name"]])
                for tree in trees
            ]
            for module_hash, trees in building_blocks.items()
        }
        building_blocks = []
        for block_list in blocks_by_module_hash.values():
            for block in block_list:
                building_blocks.append(block)
        building_blocks = sorted(building_blocks,
                                 key=lambda x: x.layer_range.start)
        if len(building_blocks) >= 2:
            for block, next_block in zip(building_blocks[:-1],
                                         building_blocks[1:]):
                block.layer_range = range(block.layer_range.start,
                                          next_block.layer_range.start)
        return building_blocks

    def get_all_blocks(self):
        building_blocks = []
        for block_list in self.blocks_by_edge_hash.values():
            for block in block_list:
                building_blocks.append(block)
        building_blocks = sorted(building_blocks,
                                 key=lambda x: x.layer_range.start)
        all_blocks = []
        current_layer_index = 0
        block_id = 0
        for block in building_blocks:
            assert current_layer_index <= block.layer_range.start
            if current_layer_index < block.layer_range.start:
                new_block = BuildingBlock(
                    self.graph,
                    range(current_layer_index, block.layer_range.start))
                new_block.block_id = block_id
                block_id += 1
                all_blocks.append(new_block)
            block.block_id = block_id
            block_id += 1
            all_blocks.append(block)
            current_layer_index = block.layer_range.stop
        if current_layer_index < self.graph.num_layers:
            new_block = BuildingBlock(
                self.graph, range(current_layer_index, self.graph.num_layers))
            new_block.block_id = block_id
            all_blocks.append(new_block)
        sorted_layer_ids = get_sorted_layer_ids(self.network)
        for block in all_blocks:
            block.collect_relative_inter_edges(self.layer_to_block)
            for layer_id in sorted_layer_ids:
                if layer_id in block.layer_range:
                    block.sorted_layer_ids.append(layer_id)
        return all_blocks

    def get_backbone_blocks(self):
        sorted_blocks = sorted(
            self.blocks_by_edge_hash.values(),
            key=lambda blocks: (len(blocks), len(blocks[0].layer_range)),
        )
        if len(sorted_blocks) == 0:
            return []
        else:
            return sorted_blocks[-1]

    def get_blocks_by_module_hash(self, blocks):
        blocks_by_module_hash = {}
        for block in blocks:
            module_hash = self.get_module_hash(block.layer_range)
            if module_hash not in blocks_by_module_hash:
                blocks_by_module_hash[module_hash] = []
            blocks_by_module_hash[module_hash].append(block)
        for module_hash in [*blocks_by_module_hash.keys()]:
            if len(blocks_by_module_hash[module_hash]) == 1:
                del blocks_by_module_hash[module_hash]
        return blocks_by_module_hash

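    # Comment added for clarity: get_module_tree() nests modules by the
    # components of their dotted names (e.g. a hypothetical
    # "transformer.layers.0" becomes nodes "transformer" -> "layers" -> "0");
    # a node's "name" field holds the full dotted name when that module
    # appears in module_to_layer_range_map and stays None otherwise.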
    def get_module_tree(self):
        module_tree = {"children": {}, "name": None}
        for module_name in self.module_to_layer_range_map.keys():
            full_name = module_name.split('.')
            current_tree = module_tree["children"]
            for depth, name in enumerate(full_name):
                if name not in current_tree:
                    current_tree[name] = {"children": {}, "name": None}
                if depth == len(full_name) - 1:
                    current_tree[name]["name"] = module_name
                else:
                    current_tree = current_tree[name]["children"]
        return module_tree

    def get_blocks_by_edge_hash(self, blocks_by_module_hash):
        blocks_by_edge_hash = {}
        for block_list in blocks_by_module_hash.values():
            for block in block_list:
                block.collect_edges()
                edge_hash = block.edge_hash
                if edge_hash not in blocks_by_edge_hash:
                    blocks_by_edge_hash[edge_hash] = []
                blocks_by_edge_hash[edge_hash].append(block)
        for edge_hash in [*blocks_by_edge_hash.keys()]:
            if len(blocks_by_edge_hash[edge_hash]) == 1:
                del blocks_by_edge_hash[edge_hash]
            else:
                block_list = blocks_by_edge_hash[edge_hash]
                blocks_by_edge_hash[edge_hash] = sorted(
                    block_list, key=lambda x: x.layer_range.start)
        for type_id, block_list in enumerate(blocks_by_edge_hash.values()):
            for block in block_list:
                block.type_id = type_id
        return blocks_by_edge_hash

    def get_layer_to_block(self):
        layer_to_block = {}
        for block_list in self.blocks_by_edge_hash.values():
            for block in block_list:
                for layer_index in block.layer_range:
                    layer_to_block[layer_index] = block
        return layer_to_block

    def clean_blocks(self):
        for block in self.blocks:
            block.p2p_type = None
            block.is_superset = False
            block.is_subset = False

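    # Comment added for clarity: mark_p2p_type() tags the first backbone block
    # of every stage after the first with the kind of P2P transfer required
    # from the previous stage: CROSS_HOST when the two adjacent stages end and
    # start on different hosts, CROSS_DEVICE otherwise.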
    def mark_p2p_type(self, phy_mesh, stage_phy_meshes,
                      graph_config: GraphConfig):
        if len(self.backbone_blocks) == 0 or len(stage_phy_meshes) == 1:
            return
        assert len(self.backbone_blocks) % len(stage_phy_meshes) == 0
        block_per_stage = len(self.backbone_blocks) // len(stage_phy_meshes)

        for block in self.backbone_blocks:
            block.p2p_type = None
        for stage_index, stage_phy_mesh in enumerate(stage_phy_meshes[:-1]):
            next_stage_phy_mesh = stage_phy_meshes[stage_index + 1]
            last_device_id = stage_phy_mesh.phy_devices_id.flatten()[-1]
            next_first_device_id = next_stage_phy_mesh.phy_devices_id.flatten(
            )[0]
            num_devices_per_host = phy_mesh.num_devices_per_host
            next_block = self.backbone_blocks[(stage_index + 1) *
                                              block_per_stage]
            if last_device_id // num_devices_per_host != next_first_device_id // num_devices_per_host:
                next_block.p2p_type = P2PType.CROSS_HOST
                graph_config.has_cross_host = True
            else:
                next_block.p2p_type = P2PType.CROSS_DEVICE
                graph_config.has_cross_device = True

    def get_graph_mapping(self):
        layer_mapping = {}
        block_mapping = {}
        p2p_types = {}
        p2p_tensors = {}
        for block_list in self.blocks_by_edge_hash.values():
            superset_blocks = []
            superset_block_index = {}
            for block in block_list:
                block_added = False
                for index, superset_block in enumerate(list(superset_blocks)):
                    if block.p2p_type == superset_block.p2p_type:
                        if block.relative_inter_edges.issubset(
                                superset_block.relative_inter_edges):
                            block.is_subset = True
                            block.is_superset = False
                            superset_block_index[id(block)] = index
                            block_added = True
                            break
                        elif superset_block.relative_inter_edges.issubset(
                                block.relative_inter_edges):
                            superset_block.is_subset = True
                            superset_block.is_superset = False
                            block.is_subset = False
                            block.is_superset = True
                            superset_blocks[index] = block
                            superset_block_index[id(block)] = index
                            block_added = True
                            break
                if not block_added:
                    block.is_subset = False
                    block.is_superset = True
                    superset_blocks.append(block)
                    superset_block_index[id(block)] = len(superset_blocks) - 1
            for block in block_list:
                assert not (block.is_subset and block.is_superset)
                if block.is_subset:
                    superset_block = superset_blocks[superset_block_index[id(
                        block)]]
                    block_mapping[block.block_id] = superset_block.block_id
                    owned_inputs = map(
                        lambda x: x[0],
                        sorted(block.owned_inputs.items(), key=lambda x: x[1]))
                    superset_owned_inputs = map(
                        lambda x: x[0],
                        sorted(superset_block.owned_inputs.items(),
                               key=lambda x: x[1]))
                    for from_input_id, to_input_id in zip(
                            owned_inputs, superset_owned_inputs):
                        from_input_name = self.network.get_input(
                            from_input_id).name
                        to_input_name = self.network.get_input(to_input_id).name
                        layer_mapping[from_input_name] = to_input_name
                    for from_layer_id, to_layer_id in zip(
                            block.layer_range, superset_block.layer_range):
                        from_layer = self.network.get_layer(from_layer_id)
                        to_layer = self.network.get_layer(to_layer_id)
                        layer_mapping[from_layer.name] = to_layer.name
                        for i in range(from_layer.num_outputs):
                            from_output = from_layer.get_output(i)
                            if from_output.is_network_output:
                                to_output = to_layer.get_output(i)
                                layer_mapping[from_output.name] = to_output.name
                    if block.p2p_type is not None:
                        p2p_types[block.block_id] = block.p2p_type
                        p2p_tensors[block.block_id] = [
                            *set(block.get_input_names())
                        ]
                        for from_name, to_name in zip(
                                block.get_input_names(),
                                superset_block.get_input_names()):
                            layer_mapping[
                                f"p2p_block{block.block_id}_{from_name}"] = f"p2p_block{superset_block.block_id}_{to_name}"
        stage_id = 0
        block_to_stage = {}
        for block in self.blocks:
            if block.p2p_type is not None:
                stage_id += 1
            block_to_stage[block.block_id] = stage_id
        return GraphMapping(
            layer_mapping,
            block_mapping,
            p2p_types,
            p2p_tensors,
            block_to_stage,
        )

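    # Comment added for clarity: create_simplified_graph() re-emits only
    # blocks that are not subsets of another block. For superset blocks that
    # carry a p2p_type it inserts identity layers named
    # "p2p_block{block_id}_{input}" in front of the block's inputs, and for
    # every superset block output it inserts a "<tensor>_same_spec" identity
    # layer tagged with a same_spec_id, which appears intended to let repeated
    # blocks share one sharding spec.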
    def create_simplified_graph(self, graph_config: GraphConfig):
        new_graph = PipelineGraph.create_graph()
        new_graph._io_buffer_mapping = self.graph._io_buffer_mapping
        layer_mapping = graph_config.graph_mapping.layer_mapping

        for i in range(self.network.num_inputs):
            trt_input = self.network.get_input(i)
            if trt_input.name not in layer_mapping:
                new_graph.add_input(trt_input)

        last_blocks = {}
        same_spec_mapping = {}
        same_spec_layer_mapping = {}
        shape_mapping = {}
        building_block_id = 0
        same_spec_ids = {}
        same_spec_count = 0
        for block in self.blocks:
            if not block.is_subset:
                stage_type = None
                if not block.is_superset:
                    if block.block_id == 0:
                        stage_type = StageType.START
                    elif block.block_id == len(self.blocks) - 1:
                        stage_type = StageType.END
                input_mapping = block.get_input_mapping(last_blocks)
                for from_name, to_name in [*input_mapping.items()]:
                    if to_name in same_spec_mapping:
                        input_mapping[from_name] = same_spec_mapping[to_name]
                    if to_name in layer_mapping:
                        input_mapping[from_name] = layer_mapping[to_name]
                if block.is_superset and block.p2p_type is not None:
                    for from_name, to_name in [*input_mapping.items()]:
                        output_tensor = new_graph.get_tensor(to_name)
                        p2p_layer = new_graph.as_trt().add_identity(
                            output_tensor.as_trt())
                        p2p_layer.name = f"p2p_block{block.block_id}_{from_name}"
                        p2p_tensor = p2p_layer.get_output(0)
                        p2p_tensor.name = f"{p2p_layer.name}_output"
                        wrapped_layer = new_graph.register_layer(p2p_layer)
                        wrapped_layer.attrs[
                            "building_block_id"] = building_block_id
                        wrapped_layer.attrs["p2p_type"] = block.p2p_type
                        input_mapping[from_name] = p2p_tensor.name
                        shape_mapping[p2p_tensor.name] = from_name
                    building_block_id += 1
                for i in block.sorted_layer_ids:
                    layer = self.network.get_layer(i)
                    wrapped_layer = new_graph.add_layer(
                        layer,
                        input_mapping=input_mapping,
                    )
                    wrapped_layer.attrs["building_block_id"] = building_block_id
                    wrapped_layer.attrs["stage_type"] = stage_type
                if block.is_superset:
                    last_blocks[block.type_id] = block

                    if block.type_id in same_spec_ids:
                        same_spec_id = same_spec_ids[block.type_id]
                        update_same_spec_count = False
                    else:
                        same_spec_id = same_spec_count
                        same_spec_ids[block.type_id] = same_spec_id
                        update_same_spec_count = True
                    count = same_spec_id
                    for i, (layer_offset,
                            output_index) in enumerate(block.outputs):
                        layer = self.network.get_layer(block.layer_range.start +
                                                       layer_offset)
                        tensor_name = layer.get_output(output_index).name
                        output_tensor = new_graph.get_tensor(tensor_name)
                        same_spec_layer = new_graph.as_trt().add_identity(
                            output_tensor.as_trt())
                        same_spec_layer.name = f"{tensor_name}_same_spec"
                        same_spec_tensor = same_spec_layer.get_output(0)
                        same_spec_tensor.name = f"{same_spec_layer.name}_output"
                        wrapped_layer = new_graph.register_layer(
                            same_spec_layer)
                        wrapped_layer.attrs[
                            "building_block_id"] = building_block_id
                        wrapped_layer.attrs["same_spec_id"] = count
                        count += 1
                        same_spec_mapping[tensor_name] = same_spec_tensor.name
                        same_spec_layer_mapping[
                            same_spec_layer.name] = layer.name
                        shape_mapping[same_spec_tensor.name] = tensor_name
                    for i, graph_input_index in enumerate(
                            block.owned_inputs.keys()):
                        input_name = self.network.get_input(
                            graph_input_index).name
                        input_tensor = new_graph.get_input(input_name)
                        input_tensor.attrs["same_spec_id"] = count
                        count += 1
                    if update_same_spec_count:
                        same_spec_count = count
                building_block_id += 1
        graph_config.graph_mapping.same_spec_layer_mapping = same_spec_layer_mapping

        if len(self.backbone_blocks) >= 2:
            start_block = self.backbone_blocks[0]
            if start_block.is_subset:
                start_block = self.blocks[graph_config.graph_mapping.
                                          block_mapping[start_block.block_id]]
            for i in start_block.layer_range:
                layer_name = self.network.get_layer(i).name
                layer = new_graph.get_layer(layer_name)
                layer.attrs["in_start_block"] = True
            end_block = self.backbone_blocks[-1]
            if end_block.is_subset:
                end_block = self.blocks[graph_config.graph_mapping.
                                        block_mapping[end_block.block_id]]
            for i in end_block.layer_range:
                layer_name = self.network.get_layer(i).name
                layer = new_graph.get_layer(layer_name)
                layer.attrs["in_end_block"] = True
        slowest_p2p_type = None
        if graph_config.has_cross_host:
            slowest_p2p_type = P2PType.CROSS_HOST
        elif graph_config.has_cross_device:
            slowest_p2p_type = P2PType.CROSS_DEVICE
        if slowest_p2p_type is not None:
            for block in self.blocks:
                if block.is_superset and block.p2p_type == slowest_p2p_type:
                    for i in block.layer_range:
                        layer_name = self.network.get_layer(i).name
                        layer = new_graph.get_layer(layer_name)
                        layer.attrs["in_slowest_block"] = True

        for i in range(self.network.num_outputs):
            trt_output = self.network.get_output(i)
            output = self.graph.get_output(trt_output.name)
            if output.producer is not None and output.producer.index in self.layer_to_block and self.layer_to_block[
                    output.producer.index].is_subset:
                continue
            if trt_output.is_shape_tensor:
                new_output = new_graph.add_output_shape(trt_output)
            else:
                new_output = new_graph.add_output(trt_output)
            sharded_io = False
            for pattern in self.sharded_io_allowlist:
                if re.match(pattern, new_output.name):
                    sharded_io = True
                    break
            if not sharded_io:
                new_output.producer.attrs["is_replicated"] = True

        for input in new_graph.inputs:
            input_name = input.name
            sharded_io = False
            for pattern in self.sharded_io_allowlist:
                if re.match(pattern, input_name):
                    sharded_io = True
                    break
            if not sharded_io:
                input.attrs["is_replicated"] = True
            for pattern, repl in self.same_spec_io.items():
                if re.match(pattern, input_name):
                    output_name = re.sub(pattern, repl, input_name)
                    output = new_graph.get_output(output_name)
                    if output is not None:
                        if "same_spec_id" in input.attrs:
                            same_spec_id = input.attrs["same_spec_id"]
                        else:
                            same_spec_id = same_spec_count
                            same_spec_count += 1
                            input.attrs["same_spec_id"] = same_spec_id
                        output.attrs["same_spec_id"] = same_spec_id
                        if math.prod(self.graph.get_input(
                                input_name).shape) < math.prod(
                                    self.graph.get_output(output_name).shape):
                            input.attrs["no_memory_footprint"] = True
                        else:
                            output.attrs["no_memory_footprint"] = True

        return new_graph, shape_mapping

    def enrich_shape_info(self, shape_mapping):
        shapes = self.shape_info.shapes.copy()
        max_shapes = self.shape_info.max_shapes.copy()
        values = self.shape_info.values.copy()
        shape_layers = self.shape_info.shape_layers
        for from_name, to_name in shape_mapping.items():
            if to_name in shapes:
                shapes[from_name] = shapes[to_name]
            if to_name in max_shapes:
                max_shapes[from_name] = max_shapes[to_name]
            if to_name in values:
                values[from_name] = values[to_name]
        shape_info = ShapeInfo(shapes, values, shape_layers, max_shapes)
        return shape_info

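    # Comment added for clarity: simplify_graph() expects infer_shapes() to
    # have been called first (it reads self.num_micro_batches and
    # self.shape_info). It returns (None, None) when the number of backbone
    # blocks is not divisible by num_stages; otherwise it returns the
    # simplified PipelineGraph together with a GraphConfig describing the
    # stage meshes and the mappings back to the full network.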
    def simplify_graph(
            self, phy_mesh: PhysicalDeviceMesh, num_stages: int,
            num_devices_per_stage: int) -> Tuple[PipelineGraph, GraphConfig]:
        num_blocks = len(self.backbone_blocks)
        if num_blocks % num_stages != 0:
            return None, None
        graph_config = GraphConfig()
        graph_config.num_micro_batches = self.num_micro_batches
        graph_config.num_blocks = num_blocks
        graph_config.num_stages = num_stages
        graph_config.phy_mesh = phy_mesh
        stage_phy_meshes = phy_mesh.split_pipeline_meshes(
            num_stages, num_devices_per_stage)
        graph_config.stage_phy_meshes = stage_phy_meshes
        with silent_trt_logger():
            self.clean_blocks()
            self.mark_p2p_type(phy_mesh, stage_phy_meshes, graph_config)
            graph_config.graph_mapping = self.get_graph_mapping()
            new_graph, shape_mapping = self.create_simplified_graph(
                graph_config)
            shape_info = self.enrich_shape_info(shape_mapping)
            new_graph.assign_shapes(shape_info)
            return new_graph, graph_config

    def get_graph_mapping_for_shape(self):
        layer_mapping = {}
        tensor_mapping = {}
        for block_list in self.blocks_by_edge_hash.values():
            head_block = block_list[0]
            for block in block_list[1:]:
                for from_layer_id, to_layer_id in zip(block.layer_range,
                                                      head_block.layer_range):
                    from_layer = self.network.get_layer(from_layer_id)
                    to_layer = self.network.get_layer(to_layer_id)
                    layer_mapping[from_layer.name] = to_layer.name
                    for i in range(from_layer.num_outputs):
                        tensor_mapping[from_layer.get_output(
                            i).name] = to_layer.get_output(i).name
        return layer_mapping, tensor_mapping

    def create_simplified_graph_for_shape(self):
        new_graph = PipelineGraph.create_graph()

        for i in range(self.network.num_inputs):
            trt_input = self.network.get_input(i)
            new_graph.add_input(trt_input)

        head_blocks = {}
        removed_blocks = set()
        removed_layers = set()
        for block_list in self.blocks_by_edge_hash.values():
            head_block = block_list[0]
            head_blocks[head_block.type_id] = head_block
            for block in block_list[1:]:
                removed_blocks.add(id(block))
                for layer_index in block.layer_range:
                    removed_layers.add(layer_index)

        for block in self.blocks:
            if not id(block) in removed_blocks:
                input_mapping = block.get_input_mapping(head_blocks)
                for i in block.sorted_layer_ids:
                    layer = self.network.get_layer(i)
                    new_graph.add_layer(
                        layer,
                        input_mapping=input_mapping,
                    )

        for i in range(self.network.num_outputs):
            trt_output = self.network.get_output(i)
            output = self.graph.get_output(trt_output.name)
            if output.producer is not None and output.producer.index in removed_layers:
                continue
            if trt_output.is_shape_tensor:
                new_graph.add_output_shape(trt_output)
            else:
                new_graph.add_output(trt_output)

        return new_graph

    def get_full_shape_info(self, num_micro_batches):
        layer_mapping, tensor_mapping = self.graph_mapping_for_shape
        optimization_profiles = self.llm_network._generate_optimization_profiles(
        )
        if len(optimization_profiles) > 0:
            optimization_profile = optimization_profiles[-1]
        else:
            optimization_profile = None
        shape_info = get_shape_info(self.graph_for_shape.as_trt(),
                                    optimization_profile)
        max_shape_info = get_shape_info(self.graph_for_shape.as_trt(),
                                        optimization_profile,
                                        shape_type=ShapeType.MAX)
        shape_info.max_shapes = max_shape_info.shapes
        for removed_tensor_name, tensor_name in tensor_mapping.items():
            shape_info.shapes[removed_tensor_name] = shape_info.shapes[
                tensor_name]
            shape_info.max_shapes[removed_tensor_name] = shape_info.max_shapes[
                tensor_name]
            if tensor_name in shape_info.values:
                shape_info.values[removed_tensor_name] = shape_info.values[
                    tensor_name]
        for removed_layer_name, layer_name in layer_mapping.items():
            if layer_name in shape_info.shape_layers:
                shape_info.shape_layers.add(removed_layer_name)
        return shape_info

    def init_layer_hash(self):
        with silent_trt_logger():
            optimization_profiles = self.llm_network._generate_optimization_profiles(
            )
            if len(optimization_profiles) > 0:
                optimization_profile = optimization_profiles[-1]
            else:
                optimization_profile = None
            shape_info = get_shape_info(self.network, optimization_profile)
        dtypes = {tensor.name: tensor.dtype for tensor in self.graph.tensors}
        for layer in self.graph.layers:
            layer_hash = get_cache_key(
                layer.as_trt(),
                shape_info.shapes,
                shape_info.values,
                dtypes,
            )
            layer.attrs["hash"] = layer_hash
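
# Illustrative usage sketch (comment added; not part of the original module).
# It assumes a built tensorrt_llm Network `network`, an AutoParallelConfig
# `config`, and a PhysicalDeviceMesh `phy_mesh` obtained elsewhere; the stage
# and device counts below are placeholders.
#
#   simplifier = Simplifier(network, config)
#   for num_micro_batches in simplifier.list_all_num_micro_batches():
#       simplifier.infer_shapes(num_micro_batches)
#       simplified_graph, graph_config = simplifier.simplify_graph(
#           phy_mesh, num_stages=2, num_devices_per_stage=4)
#       if simplified_graph is None:
#           continue  # backbone block count not divisible by num_stages
#       # hand simplified_graph and graph_config to the auto-parallel solver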