[TRTLLM-8684][chore] Migrate BuildConfig to Pydantic, add a Python wrapper for KVCacheType enum (#8330)

Signed-off-by: Anish Shanbhag <ashanbhag@nvidia.com>
Author: Anish Shanbhag
Date: 2025-10-28 09:17:26 -07:00 (committed by GitHub)
Parent: cdc9e5e645
Commit: a09b38a862
32 changed files with 363 additions and 429 deletions

View File

@ -23,7 +23,7 @@ from transformers import AutoModelForCausalLM, LlamaTokenizer
import tensorrt_llm import tensorrt_llm
import tensorrt_llm.profiler as profiler import tensorrt_llm.profiler as profiler
from tensorrt_llm.bindings import KVCacheType from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.logger import logger from tensorrt_llm.logger import logger
from tensorrt_llm.quantization import QuantMode from tensorrt_llm.quantization import QuantMode
@ -97,7 +97,7 @@ def TRTLLaMA(args, config):
quantization_config = pretrained_config['quantization'] quantization_config = pretrained_config['quantization']
build_config = config['build_config'] build_config = config['build_config']
kv_cache_type = KVCacheType.from_string(build_config['kv_cache_type']) kv_cache_type = KVCacheType(build_config['kv_cache_type'])
plugin_config = build_config['plugin_config'] plugin_config = build_config['plugin_config']
dtype = pretrained_config['dtype'] dtype = pretrained_config['dtype']
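
The call sites above switch from the pybind11 enum's KVCacheType.from_string() to plain construction, which works because the new wrapper is a str-backed Python enum. A minimal sketch of the pattern (the config value is illustrative):

from tensorrt_llm.llmapi.kv_cache_type import KVCacheType

build_config = {"kv_cache_type": "paged"}  # illustrative value

# Old: kv_cache_type = KVCacheType.from_string(build_config["kv_cache_type"])
# New: the str-backed enum accepts the value directly
kv_cache_type = KVCacheType(build_config["kv_cache_type"])
assert kv_cache_type is KVCacheType.PAGED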

View File

@ -27,7 +27,7 @@ from utils import add_common_args
import tensorrt_llm import tensorrt_llm
import tensorrt_llm.profiler as profiler import tensorrt_llm.profiler as profiler
from tensorrt_llm import logger from tensorrt_llm import logger
from tensorrt_llm.bindings import KVCacheType from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.quantization import QuantMode from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import (PYTHON_BINDINGS, ModelConfig, ModelRunner, from tensorrt_llm.runtime import (PYTHON_BINDINGS, ModelConfig, ModelRunner,
SamplingConfig, Session, TensorInfo) SamplingConfig, Session, TensorInfo)
@ -122,8 +122,7 @@ class QWenInfer(object):
num_kv_heads = config["pretrained_config"].get("num_key_value_heads", num_kv_heads = config["pretrained_config"].get("num_key_value_heads",
num_heads) num_heads)
if "kv_cache_type" in config["build_config"]: if "kv_cache_type" in config["build_config"]:
kv_cache_type = KVCacheType.from_string( kv_cache_type = KVCacheType(config["build_config"]["kv_cache_type"])
config["build_config"]["kv_cache_type"])
else: else:
kv_cache_type = KVCacheType.CONTINUOUS kv_cache_type = KVCacheType.CONTINUOUS

View File

@ -25,7 +25,7 @@ from vit_onnx_trt import Preprocss
import tensorrt_llm import tensorrt_llm
import tensorrt_llm.profiler as profiler import tensorrt_llm.profiler as profiler
from tensorrt_llm import logger from tensorrt_llm import logger
from tensorrt_llm.bindings import KVCacheType from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.quantization import QuantMode from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import (ModelConfig, SamplingConfig, Session, from tensorrt_llm.runtime import (ModelConfig, SamplingConfig, Session,
TensorInfo) TensorInfo)
@ -118,8 +118,7 @@ class QWenInfer(object):
num_kv_heads = config["pretrained_config"].get("num_key_value_heads", num_kv_heads = config["pretrained_config"].get("num_key_value_heads",
num_heads) num_heads)
if "kv_cache_type" in config["build_config"]: if "kv_cache_type" in config["build_config"]:
kv_cache_type = KVCacheType.from_string( kv_cache_type = KVCacheType(config["build_config"]["kv_cache_type"])
config["build_config"]["kv_cache_type"])
else: else:
kv_cache_type = KVCacheType.CONTINUOUS kv_cache_type = KVCacheType.CONTINUOUS

View File

@ -33,7 +33,8 @@ import tensorrt_llm
import tensorrt_llm.logger as logger import tensorrt_llm.logger as logger
from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt, from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
trt_dtype_to_torch) trt_dtype_to_torch)
from tensorrt_llm.bindings import GptJsonConfig, KVCacheType from tensorrt_llm.bindings import GptJsonConfig
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelConfig, SamplingConfig from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelConfig, SamplingConfig
from tensorrt_llm.runtime.session import Session, TensorInfo from tensorrt_llm.runtime.session import Session, TensorInfo

View File

@ -9,7 +9,6 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
from tensorrt_llm.models.modeling_utils import QuantConfig from tensorrt_llm.models.modeling_utils import QuantConfig
from ...llmapi.llm_args import BaseLlmArgs, BuildConfig, KvCacheConfig, _ParallelConfig from ...llmapi.llm_args import BaseLlmArgs, BuildConfig, KvCacheConfig, _ParallelConfig
from ...llmapi.utils import get_type_repr
from .models import ModelFactory, ModelFactoryRegistry from .models import ModelFactory, ModelFactoryRegistry
from .utils._config import DynamicYamlMixInForSettings from .utils._config import DynamicYamlMixInForSettings
from .utils.logger import ad_logger from .utils.logger import ad_logger
@ -318,12 +317,11 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):
model_config = _get_config_dict() model_config = _get_config_dict()
build_config: Optional[object] = Field( build_config: Optional[BuildConfig] = Field(
default_factory=lambda: BuildConfig(), default_factory=BuildConfig,
description="!!! DO NOT USE !!! Internal only; needed for BaseLlmArgs compatibility.", description="!!! DO NOT USE !!! Internal only; needed for BaseLlmArgs compatibility.",
exclude_from_json=True, exclude_from_json=True,
frozen=True, frozen=True,
json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"},
repr=False, repr=False,
) )
backend: Literal["_autodeploy"] = Field( backend: Literal["_autodeploy"] = Field(

View File

@ -22,8 +22,8 @@ TUNED_QUANTS = {
QuantAlgo.NVFP4, QuantAlgo.FP8, QuantAlgo.FP8_BLOCK_SCALES, QuantAlgo.NVFP4, QuantAlgo.FP8, QuantAlgo.FP8_BLOCK_SCALES,
QuantAlgo.NO_QUANT, None QuantAlgo.NO_QUANT, None
} }
DEFAULT_MAX_BATCH_SIZE = BuildConfig.max_batch_size DEFAULT_MAX_BATCH_SIZE = BuildConfig.model_fields["max_batch_size"].default
DEFAULT_MAX_NUM_TOKENS = BuildConfig.max_num_tokens DEFAULT_MAX_NUM_TOKENS = BuildConfig.model_fields["max_num_tokens"].default
def get_benchmark_engine_settings( def get_benchmark_engine_settings(
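
With BuildConfig now a Pydantic model, its defaults can no longer be read as plain class attributes the way dataclass defaults could; they live in model_fields. A standalone sketch of the pattern (ExampleConfig stands in for the real BuildConfig):

from pydantic import BaseModel, Field

class ExampleConfig(BaseModel):
    max_batch_size: int = Field(default=2048)
    max_num_tokens: int = Field(default=8192)

# Read defaults from the FieldInfo objects instead of class attributes:
DEFAULT_MAX_BATCH_SIZE = ExampleConfig.model_fields["max_batch_size"].default
DEFAULT_MAX_NUM_TOKENS = ExampleConfig.model_fields["max_num_tokens"].default
print(DEFAULT_MAX_BATCH_SIZE, DEFAULT_MAX_NUM_TOKENS)  # 2048 8192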

View File

@ -12,27 +12,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import copy
import dataclasses
import json import json
import math import math
import os import os
import shutil import shutil
import time import time
from dataclasses import dataclass, field
from functools import cache
from pathlib import Path from pathlib import Path
from typing import Dict, Optional, Union from typing import Dict, Optional, Union
import numpy as np import numpy as np
import tensorrt as trt import tensorrt as trt
from pydantic import BaseModel, Field
from ._common import _is_building, check_max_num_tokens, serialize_engine from ._common import _is_building, check_max_num_tokens, serialize_engine
from ._utils import (get_sm_version, np_bfloat16, np_float8, str_dtype_to_trt, from ._utils import (get_sm_version, np_bfloat16, np_float8, str_dtype_to_trt,
to_json_file, trt_gte) to_json_file, trt_gte)
from .bindings import KVCacheType
from .functional import PositionEmbeddingType from .functional import PositionEmbeddingType
from .graph_rewriting import optimize from .graph_rewriting import optimize
from .llmapi.kv_cache_type import KVCacheType
from .logger import logger from .logger import logger
from .lora_helper import LoraConfig from .lora_helper import LoraConfig
from .models import PretrainedConfig, PretrainedModel from .models import PretrainedConfig, PretrainedModel
@ -46,10 +43,7 @@ from .version import __version__
class ConfigEncoder(json.JSONEncoder): class ConfigEncoder(json.JSONEncoder):
def default(self, obj): def default(self, obj):
if isinstance(obj, KVCacheType): if hasattr(obj, 'model_dump'):
# For KVCacheType, convert it to string by split of 'KVCacheType.PAGED'.
return obj.__str__().split('.')[-1]
elif hasattr(obj, 'model_dump'):
# Handle Pydantic models (including DecodingBaseConfig and subclasses) # Handle Pydantic models (including DecodingBaseConfig and subclasses)
return obj.model_dump(mode='json') return obj.model_dump(mode='json')
else: else:
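
The encoder no longer needs a KVCacheType branch: a (str, Enum) member is already a string as far as json is concerned, so only Pydantic models need the model_dump fallback. A standalone sketch (the names are illustrative):

import json
from enum import Enum
from pydantic import BaseModel

class KVCacheTypeSketch(str, Enum):
    PAGED = "paged"

class PluginSketch(BaseModel):
    dtype: str = "float16"

class EncoderSketch(json.JSONEncoder):
    def default(self, obj):
        if hasattr(obj, "model_dump"):
            return obj.model_dump(mode="json")  # Pydantic models serialize themselves
        return super().default(obj)

# The str-backed enum member encodes as "paged" without any special casing:
print(json.dumps({"kv_cache_type": KVCacheTypeSketch.PAGED, "plugin": PluginSketch()},
                 cls=EncoderSketch))
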
@ -456,75 +450,112 @@ class Builder():
logger.info(f'Config saved to {config_path}.') logger.info(f'Config saved to {config_path}.')
@dataclass class BuildConfig(BaseModel):
class BuildConfig:
"""Configuration class for TensorRT LLM engine building parameters. """Configuration class for TensorRT LLM engine building parameters.
This class contains all the configuration parameters needed to build a TensorRT LLM engine, This class contains all the configuration parameters needed to build a TensorRT LLM engine,
including sequence length limits, batch sizes, optimization settings, and various features. including sequence length limits, batch sizes, optimization settings, and various features.
Args:
max_input_len (int): Maximum length of input sequences. Defaults to 1024.
max_seq_len (int, optional): The maximum possible sequence length for a single request, including both input and generated output tokens. Defaults to None.
opt_batch_size (int): Optimal batch size for engine optimization. Defaults to 8.
max_batch_size (int): Maximum batch size the engine can handle. Defaults to 2048.
max_beam_width (int): Maximum beam width for beam search decoding. Defaults to 1.
max_num_tokens (int): Maximum number of batched input tokens after padding is removed in each batch. Defaults to 8192.
opt_num_tokens (int, optional): Optimal number of batched input tokens for engine optimization. Defaults to None.
max_prompt_embedding_table_size (int): Maximum size of prompt embedding table for prompt tuning. Defaults to 0.
kv_cache_type (KVCacheType, optional): Type of KV cache to use (CONTINUOUS or PAGED). If None, defaults to PAGED. Defaults to None.
gather_context_logits (int): Whether to gather logits during context phase. Defaults to False.
gather_generation_logits (int): Whether to gather logits during generation phase. Defaults to False.
strongly_typed (bool): Whether to use strongly_typed. Defaults to True.
force_num_profiles (int, optional): Force a specific number of optimization profiles. If None, auto-determined. Defaults to None.
profiling_verbosity (str): Verbosity level for TensorRT profiling ('layer_names_only', 'detailed', 'none'). Defaults to 'layer_names_only'.
enable_debug_output (bool): Whether to enable debug output during building. Defaults to False.
max_draft_len (int): Maximum length of draft tokens for speculative decoding. Defaults to 0.
speculative_decoding_mode (SpeculativeDecodingMode): Mode for speculative decoding (NONE, MEDUSA, EAGLE, etc.). Defaults to SpeculativeDecodingMode.NONE.
use_refit (bool): Whether to enable engine refitting capabilities. Defaults to False.
input_timing_cache (str, optional): Path to input timing cache file. If None, no input cache used. Defaults to None.
output_timing_cache (str): Path to output timing cache file. Defaults to 'model.cache'.
lora_config (LoraConfig): Configuration for LoRA (Low-Rank Adaptation) fine-tuning. Defaults to default LoraConfig.
weight_sparsity (bool): Whether to enable weight sparsity optimization. Defaults to False.
weight_streaming (bool): Whether to enable weight streaming for large models. Defaults to False.
plugin_config (PluginConfig): Configuration for TensorRT LLM plugins. Defaults to default PluginConfig.
use_strip_plan (bool): Whether to use stripped plan for engine building. Defaults to False.
max_encoder_input_len (int): Maximum encoder input length for encoder-decoder models. Defaults to 1024.
dry_run (bool): Whether to perform a dry run without actually building the engine. Defaults to False.
visualize_network (str, optional): Path to save network visualization. If None, no visualization generated. Defaults to None.
monitor_memory (bool): Whether to monitor memory usage during building. Defaults to False.
use_mrope (bool): Whether to use Multi-RoPE (Rotary Position Embedding) optimization. Defaults to False.
""" """
max_input_len: int = 1024 max_input_len: int = Field(default=1024,
max_seq_len: int = None description="Maximum length of input sequences.")
opt_batch_size: int = 8 max_seq_len: Optional[int] = Field(
max_batch_size: int = 2048 default=None,
max_beam_width: int = 1 description=
max_num_tokens: int = 8192 "The maximum possible sequence length for a single request, including both input and generated "
opt_num_tokens: Optional[int] = None "output tokens.")
max_prompt_embedding_table_size: int = 0 opt_batch_size: int = Field(
kv_cache_type: KVCacheType = None default=8, description="Optimal batch size for engine optimization.")
gather_context_logits: int = False max_batch_size: int = Field(
gather_generation_logits: int = False default=2048, description="Maximum batch size the engine can handle.")
strongly_typed: bool = True max_beam_width: int = Field(
force_num_profiles: Optional[int] = None default=1, description="Maximum beam width for beam search decoding.")
profiling_verbosity: str = 'layer_names_only' max_num_tokens: int = Field(
enable_debug_output: bool = False default=8192,
max_draft_len: int = 0 description="Maximum number of batched input tokens after padding is "
speculative_decoding_mode: SpeculativeDecodingMode = SpeculativeDecodingMode.NONE "removed in each batch.")
use_refit: bool = False opt_num_tokens: Optional[int] = Field(
input_timing_cache: str = None default=None,
output_timing_cache: str = 'model.cache' description=
lora_config: LoraConfig = field(default_factory=LoraConfig) "Optimal number of batched input tokens for engine optimization.")
weight_sparsity: bool = False max_prompt_embedding_table_size: int = Field(
weight_streaming: bool = False default=0,
plugin_config: PluginConfig = field(default_factory=PluginConfig) description="Maximum size of prompt embedding table for prompt tuning.")
use_strip_plan: bool = False kv_cache_type: Optional[KVCacheType] = Field(
max_encoder_input_len: int = 1024 # for enc-dec DecoderModel default=None,
dry_run: bool = False description=
visualize_network: str = None "Type of KV cache to use (CONTINUOUS or PAGED). If None, defaults to PAGED."
monitor_memory: bool = False )
use_mrope: bool = False gather_context_logits: bool = Field(
default=False,
description="Whether to gather logits during context phase.")
gather_generation_logits: bool = Field(
default=False,
description="Whether to gather logits during generation phase.")
strongly_typed: bool = Field(default=True,
description="Whether to use strongly_typed.")
force_num_profiles: Optional[int] = Field(
default=None,
description=
"Force a specific number of optimization profiles. If None, auto-determined."
)
profiling_verbosity: str = Field(
default='layer_names_only',
description=
"Verbosity level for TensorRT profiling ('layer_names_only', 'detailed', 'none')."
)
enable_debug_output: bool = Field(
default=False,
description="Whether to enable debug output during building.")
max_draft_len: int = Field(
default=0,
description="Maximum length of draft tokens for speculative decoding.")
speculative_decoding_mode: SpeculativeDecodingMode = Field(
default=SpeculativeDecodingMode.NONE,
description="Mode for speculative decoding (NONE, MEDUSA, EAGLE, etc.)."
)
use_refit: bool = Field(
default=False,
description="Whether to enable engine refitting capabilities.")
input_timing_cache: Optional[str] = Field(
default=None,
description=
"Path to input timing cache file. If None, no input cache used.")
output_timing_cache: str = Field(
default='model.cache', description="Path to output timing cache file.")
lora_config: LoraConfig = Field(
default_factory=LoraConfig,
description="Configuration for LoRA (Low-Rank Adaptation) fine-tuning.")
weight_sparsity: bool = Field(
default=False,
description="Whether to enable weight sparsity optimization.")
weight_streaming: bool = Field(
default=False,
description="Whether to enable weight streaming for large models.")
plugin_config: PluginConfig = Field(
default_factory=PluginConfig,
description="Configuration for TensorRT LLM plugins.")
use_strip_plan: bool = Field(
default=False,
description="Whether to use stripped plan for engine building.")
max_encoder_input_len: int = Field(
default=1024,
description="Maximum encoder input length for encoder-decoder models.")
dry_run: bool = Field(
default=False,
description=
"Whether to perform a dry run without actually building the engine.")
visualize_network: Optional[str] = Field(
default=None,
description=
"Path to save network visualization. If None, no visualization generated."
)
monitor_memory: bool = Field(
default=False,
description="Whether to monitor memory usage during building.")
use_mrope: bool = Field(
default=False,
description=
"Whether to use Multi-RoPE (Rotary Position Embedding) optimization.")
# Since we have some overlapping between kv_cache_type, paged_kv_cache, and paged_state (later two will be deprecated in the future), # Since we have some overlapping between kv_cache_type, paged_kv_cache, and paged_state (later two will be deprecated in the future),
# we need to handle it given model architecture. # we need to handle it given model architecture.
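
Taken together, the Pydantic fields above give BuildConfig validation, coercion, and serialization for free. A sketch of the intended usage, assuming the field set shown in this diff (exact coercion behaviour depends on the installed tensorrt_llm version):

from tensorrt_llm.builder import BuildConfig

cfg = BuildConfig(max_batch_size=256, kv_cache_type="paged")  # string coerces to KVCacheType
print(cfg.max_input_len)                   # 1024, field default still applies

as_dict = cfg.model_dump(mode="json")      # replaces the hand-written to_dict()
roundtrip = BuildConfig(**as_dict)         # replaces BuildConfig.from_dict()
print(roundtrip.max_batch_size)            # 256
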
@ -574,144 +605,10 @@ class BuildConfig:
override_attri('paged_state', False) override_attri('paged_state', False)
@classmethod @classmethod
@cache def from_json_file(cls, config_file):
def get_build_config_defaults(cls):
return {
field.name: field.default
for field in dataclasses.fields(cls)
if field.default is not dataclasses.MISSING
}
@classmethod
def from_dict(cls, config, plugin_config=None):
config = copy.deepcopy(
config
) # it just does not make sense to change the input arg `config`
defaults = cls.get_build_config_defaults()
max_input_len = config.pop('max_input_len',
defaults.get('max_input_len'))
max_seq_len = config.pop('max_seq_len', defaults.get('max_seq_len'))
max_batch_size = config.pop('max_batch_size',
defaults.get('max_batch_size'))
max_beam_width = config.pop('max_beam_width',
defaults.get('max_beam_width'))
max_num_tokens = config.pop('max_num_tokens',
defaults.get('max_num_tokens'))
opt_num_tokens = config.pop('opt_num_tokens',
defaults.get('opt_num_tokens'))
opt_batch_size = config.pop('opt_batch_size',
defaults.get('opt_batch_size'))
max_prompt_embedding_table_size = config.pop(
'max_prompt_embedding_table_size',
defaults.get('max_prompt_embedding_table_size'))
if "kv_cache_type" in config and config["kv_cache_type"] is not None:
kv_cache_type = KVCacheType.from_string(config.pop('kv_cache_type'))
else:
kv_cache_type = None
gather_context_logits = config.pop(
'gather_context_logits', defaults.get('gather_context_logits'))
gather_generation_logits = config.pop(
'gather_generation_logits',
defaults.get('gather_generation_logits'))
strongly_typed = config.pop('strongly_typed',
defaults.get('strongly_typed'))
force_num_profiles = config.pop('force_num_profiles',
defaults.get('force_num_profiles'))
weight_sparsity = config.pop('weight_sparsity',
defaults.get('weight_sparsity'))
profiling_verbosity = config.pop('profiling_verbosity',
defaults.get('profiling_verbosity'))
enable_debug_output = config.pop('enable_debug_output',
defaults.get('enable_debug_output'))
max_draft_len = config.pop('max_draft_len',
defaults.get('max_draft_len'))
speculative_decoding_mode = config.pop(
'speculative_decoding_mode',
defaults.get('speculative_decoding_mode'))
use_refit = config.pop('use_refit', defaults.get('use_refit'))
input_timing_cache = config.pop('input_timing_cache',
defaults.get('input_timing_cache'))
output_timing_cache = config.pop('output_timing_cache',
defaults.get('output_timing_cache'))
lora_config = LoraConfig(**config.get('lora_config', {}))
max_encoder_input_len = config.pop(
'max_encoder_input_len', defaults.get('max_encoder_input_len'))
weight_streaming = config.pop('weight_streaming',
defaults.get('weight_streaming'))
use_strip_plan = config.pop('use_strip_plan',
defaults.get('use_strip_plan'))
if plugin_config is None:
plugin_config = PluginConfig()
if "plugin_config" in config.keys():
plugin_config = plugin_config.model_copy(
update=config["plugin_config"], deep=True)
dry_run = config.pop('dry_run', defaults.get('dry_run'))
visualize_network = config.pop('visualize_network',
defaults.get('visualize_network'))
monitor_memory = config.pop('monitor_memory',
defaults.get('monitor_memory'))
use_mrope = config.pop('use_mrope', defaults.get('use_mrope'))
return cls(
max_input_len=max_input_len,
max_seq_len=max_seq_len,
max_batch_size=max_batch_size,
max_beam_width=max_beam_width,
max_num_tokens=max_num_tokens,
opt_num_tokens=opt_num_tokens,
opt_batch_size=opt_batch_size,
max_prompt_embedding_table_size=max_prompt_embedding_table_size,
kv_cache_type=kv_cache_type,
gather_context_logits=gather_context_logits,
gather_generation_logits=gather_generation_logits,
strongly_typed=strongly_typed,
force_num_profiles=force_num_profiles,
profiling_verbosity=profiling_verbosity,
enable_debug_output=enable_debug_output,
max_draft_len=max_draft_len,
speculative_decoding_mode=speculative_decoding_mode,
use_refit=use_refit,
input_timing_cache=input_timing_cache,
output_timing_cache=output_timing_cache,
lora_config=lora_config,
use_strip_plan=use_strip_plan,
max_encoder_input_len=max_encoder_input_len,
weight_sparsity=weight_sparsity,
weight_streaming=weight_streaming,
plugin_config=plugin_config,
dry_run=dry_run,
visualize_network=visualize_network,
monitor_memory=monitor_memory,
use_mrope=use_mrope)
@classmethod
def from_json_file(cls, config_file, plugin_config=None):
with open(config_file) as f: with open(config_file) as f:
config = json.load(f) config = json.load(f)
return BuildConfig.from_dict(config, plugin_config=plugin_config) return BuildConfig(**config)
def to_dict(self):
output = copy.deepcopy(self.__dict__)
# the enum KVCacheType cannot be converted automatically
if output.get('kv_cache_type', None) is not None:
output['kv_cache_type'] = str(output['kv_cache_type'].name)
output['plugin_config'] = output['plugin_config'].model_dump()
output['lora_config'] = output['lora_config'].model_dump()
return output
def update_from_dict(self, config: dict):
for name, value in config.items():
if not hasattr(self, name):
raise AttributeError(
f"{self.__class__} object has no attribute {name}")
setattr(self, name, value)
def update(self, **kwargs):
self.update_from_dict(kwargs)
class EngineConfig: class EngineConfig:
@ -731,11 +628,10 @@ class EngineConfig:
def from_json_str(cls, config_str): def from_json_str(cls, config_str):
config = json.loads(config_str) config = json.loads(config_str)
return cls(PretrainedConfig.from_dict(config['pretrained_config']), return cls(PretrainedConfig.from_dict(config['pretrained_config']),
BuildConfig.from_dict(config['build_config']), BuildConfig(**config['build_config']), config['version'])
config['version'])
def to_dict(self): def to_dict(self):
build_config = self.build_config.to_dict() build_config = self.build_config.model_dump(mode="json")
build_config.pop('dry_run', None) # Not an Engine Characteristic build_config.pop('dry_run', None) # Not an Engine Characteristic
build_config.pop('visualize_network', build_config.pop('visualize_network',
None) # Not an Engine Characteristic None) # Not an Engine Characteristic
@ -1081,7 +977,7 @@ def build(model: PretrainedModel, build_config: BuildConfig) -> Engine:
''' '''
tic = time.time() tic = time.time()
# avoid changing the input config # avoid changing the input config
build_config = copy.deepcopy(build_config) build_config = build_config.model_copy(deep=True)
build_config.plugin_config.dtype = model.config.dtype build_config.plugin_config.dtype = model.config.dtype
build_config.update_kv_cache_type(model.config.architecture) build_config.update_kv_cache_type(model.config.architecture)
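
model_copy(deep=True) takes over from copy.deepcopy for the Pydantic config. A standalone sketch of the semantics (ExampleConfig is illustrative):

from pydantic import BaseModel, Field

class PluginSketch(BaseModel):
    dtype: str = "float16"

class ExampleConfig(BaseModel):
    plugin_config: PluginSketch = Field(default_factory=PluginSketch)

original = ExampleConfig()
copied = original.model_copy(deep=True)    # nested models are duplicated too
copied.plugin_config.dtype = "bfloat16"
print(original.plugin_config.dtype)        # float16 -- the input config is untouched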

View File

@ -26,8 +26,8 @@ import torch
from tensorrt_llm._utils import (local_mpi_rank, local_mpi_size, mpi_barrier, from tensorrt_llm._utils import (local_mpi_rank, local_mpi_size, mpi_barrier,
mpi_comm, mpi_rank, mpi_world_size) mpi_comm, mpi_rank, mpi_world_size)
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.builder import BuildConfig, Engine, build from tensorrt_llm.builder import BuildConfig, Engine, build
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.logger import logger, severity_map from tensorrt_llm.logger import logger, severity_map
from tensorrt_llm.lora_helper import LoraConfig from tensorrt_llm.lora_helper import LoraConfig
from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.lora_manager import LoraManager
@ -37,23 +37,6 @@ from tensorrt_llm.plugin import PluginConfig, add_plugin_argument
from tensorrt_llm.quantization.mode import QuantAlgo from tensorrt_llm.quantization.mode import QuantAlgo
def enum_type(enum_class):
def parse_enum(value):
if isinstance(value, enum_class):
return value
if isinstance(value, str):
return enum_class.from_string(value)
valid_values = [e.name for e in enum_class]
raise argparse.ArgumentTypeError(
f"Invalid value '{value}' of type {type(value).__name__}. Expected one of {valid_values}"
)
return parse_enum
def parse_arguments(): def parse_arguments():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@ -92,29 +75,30 @@ def parse_arguments():
parser.add_argument( parser.add_argument(
'--max_batch_size', '--max_batch_size',
type=int, type=int,
default=BuildConfig.max_batch_size, default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.") help="Maximum number of requests that the engine can schedule.")
parser.add_argument('--max_input_len', parser.add_argument(
type=int, '--max_input_len',
default=BuildConfig.max_input_len, type=int,
help="Maximum input length of one request.") default=BuildConfig.model_fields["max_input_len"].default,
help="Maximum input length of one request.")
parser.add_argument( parser.add_argument(
'--max_seq_len', '--max_seq_len',
'--max_decoder_seq_len', '--max_decoder_seq_len',
dest='max_seq_len', dest='max_seq_len',
type=int, type=int,
default=BuildConfig.max_seq_len, default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. " help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.") "If unspecified, the value is deduced from the model config.")
parser.add_argument( parser.add_argument(
'--max_beam_width', '--max_beam_width',
type=int, type=int,
default=BuildConfig.max_beam_width, default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.") help="Maximum number of beams for beam search decoding.")
parser.add_argument( parser.add_argument(
'--max_num_tokens', '--max_num_tokens',
type=int, type=int,
default=BuildConfig.max_num_tokens, default=BuildConfig.model_fields["max_num_tokens"].default,
help= help=
"Maximum number of batched input tokens after padding is removed in each batch. " "Maximum number of batched input tokens after padding is removed in each batch. "
"Currently, the input padding is removed by default; " "Currently, the input padding is removed by default; "
@ -123,7 +107,7 @@ def parse_arguments():
parser.add_argument( parser.add_argument(
'--opt_num_tokens', '--opt_num_tokens',
type=int, type=int,
default=BuildConfig.opt_num_tokens, default=BuildConfig.model_fields["opt_num_tokens"].default,
help= help=
"Optimal number of batched input tokens after padding is removed in each batch " "Optimal number of batched input tokens after padding is removed in each batch "
"It equals to ``max_batch_size * max_beam_width`` by default, set this " "It equals to ``max_batch_size * max_beam_width`` by default, set this "
@ -132,7 +116,7 @@ def parse_arguments():
parser.add_argument( parser.add_argument(
'--max_encoder_input_len', '--max_encoder_input_len',
type=int, type=int,
default=BuildConfig.max_encoder_input_len, default=BuildConfig.model_fields["max_encoder_input_len"].default,
help="Maximum encoder input length for enc-dec models. " help="Maximum encoder input length for enc-dec models. "
"Set ``max_input_len`` to 1 to start generation from decoder_start_token_id of length 1." "Set ``max_input_len`` to 1 to start generation from decoder_start_token_id of length 1."
) )
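
Every CLI default above is now read from the corresponding FieldInfo so that trtllm-build stays in sync with BuildConfig. A standalone sketch of the wiring (ExampleBuildConfig stands in for the real model):

import argparse
from pydantic import BaseModel, Field

class ExampleBuildConfig(BaseModel):
    max_batch_size: int = Field(default=2048)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--max_batch_size",
    type=int,
    default=ExampleBuildConfig.model_fields["max_batch_size"].default,
    help="Maximum number of requests that the engine can schedule.")
print(parser.parse_args([]).max_batch_size)  # 2048
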
@ -140,14 +124,15 @@ def parse_arguments():
'--max_prompt_embedding_table_size', '--max_prompt_embedding_table_size',
'--max_multimodal_len', '--max_multimodal_len',
type=int, type=int,
default=BuildConfig.max_prompt_embedding_table_size, default=BuildConfig.model_fields["max_prompt_embedding_table_size"].
default,
help= help=
"Maximum prompt embedding table size for prompt tuning, or maximum multimodal input size for multimodal models. " "Maximum prompt embedding table size for prompt tuning, or maximum multimodal input size for multimodal models. "
"Setting a value > 0 enables prompt tuning or multimodal input.") "Setting a value > 0 enables prompt tuning or multimodal input.")
parser.add_argument( parser.add_argument(
'--kv_cache_type', '--kv_cache_type',
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
type=enum_type(KVCacheType), type=KVCacheType,
help= help=
"Set KV cache type (continuous, paged, or disabled). For disabled case, KV cache is disabled and only context phase is allowed." "Set KV cache type (continuous, paged, or disabled). For disabled case, KV cache is disabled and only context phase is allowed."
) )
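
Passing the wrapper class itself as argparse's type= replaces the removed enum_type() helper, because calling the enum with a string (including a differently cased one) returns a member. A standalone sketch with the enum defined inline:

import argparse
from enum import Enum

class KVCacheTypeSketch(str, Enum):
    CONTINUOUS = "continuous"
    PAGED = "paged"
    DISABLED = "disabled"

    @classmethod
    def _missing_(cls, value):
        # Accept case-insensitive strings such as "PAGED" or "Paged"
        if isinstance(value, str):
            for member in cls:
                if member.value == value.lower():
                    return member
        return None

parser = argparse.ArgumentParser()
parser.add_argument("--kv_cache_type", type=KVCacheTypeSketch, default=argparse.SUPPRESS)
print(parser.parse_args(["--kv_cache_type", "PAGED"]).kv_cache_type)  # the PAGED member
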
@ -156,42 +141,44 @@ def parse_arguments():
type=str, type=str,
default=argparse.SUPPRESS, default=argparse.SUPPRESS,
help= help=
"Deprecated. Enabling this option is equvilient to ``--kv_cache_type paged`` for transformer based models." "Deprecated. Enabling this option is equivalent to ``--kv_cache_type paged`` for transformer based models."
) )
parser.add_argument( parser.add_argument(
'--input_timing_cache', '--input_timing_cache',
type=str, type=str,
default=BuildConfig.input_timing_cache, default=BuildConfig.model_fields["input_timing_cache"].default,
help= help=
"The file path to read the timing cache. This option is ignored if the file does not exist." "The file path to read the timing cache. This option is ignored if the file does not exist."
) )
parser.add_argument('--output_timing_cache', parser.add_argument(
type=str, '--output_timing_cache',
default=BuildConfig.output_timing_cache, type=str,
help="The file path to write the timing cache.") default=BuildConfig.model_fields["output_timing_cache"].default,
help="The file path to write the timing cache.")
parser.add_argument( parser.add_argument(
'--profiling_verbosity', '--profiling_verbosity',
type=str, type=str,
default=BuildConfig.profiling_verbosity, default=BuildConfig.model_fields["profiling_verbosity"].default,
choices=['layer_names_only', 'detailed', 'none'], choices=['layer_names_only', 'detailed', 'none'],
help= help=
"The profiling verbosity for the generated TensorRT engine. Setting to detailed allows inspecting tactic choices and kernel parameters." "The profiling verbosity for the generated TensorRT engine. Setting to detailed allows inspecting tactic choices and kernel parameters."
) )
parser.add_argument( parser.add_argument(
'--strip_plan', '--strip_plan',
default=BuildConfig.use_strip_plan, default=BuildConfig.model_fields["use_strip_plan"].default,
action='store_true', action='store_true',
help= help=
"Enable stripping weights from the final TensorRT engine under the assumption that the refit weights are identical to those provided at build time." "Enable stripping weights from the final TensorRT engine under the assumption that the refit weights are identical to those provided at build time."
) )
parser.add_argument('--weight_sparsity', parser.add_argument(
default=BuildConfig.weight_sparsity, '--weight_sparsity',
action='store_true', default=BuildConfig.model_fields["weight_sparsity"].default,
help="Enable weight sparsity.") action='store_true',
help="Enable weight sparsity.")
parser.add_argument( parser.add_argument(
'--weight_streaming', '--weight_streaming',
default=BuildConfig.weight_streaming, default=BuildConfig.model_fields["weight_streaming"].default,
action='store_true', action='store_true',
help= help=
"Enable offloading weights to CPU and streaming loading at runtime.", "Enable offloading weights to CPU and streaming loading at runtime.",
@ -213,10 +200,11 @@ def parse_arguments():
default='info', default='info',
choices=severity_map.keys(), choices=severity_map.keys(),
help="The logging level.") help="The logging level.")
parser.add_argument('--enable_debug_output', parser.add_argument(
default=BuildConfig.enable_debug_output, '--enable_debug_output',
action='store_true', default=BuildConfig.model_fields["enable_debug_output"].default,
help="Enable debug output.") action='store_true',
help="Enable debug output.")
parser.add_argument( parser.add_argument(
'--visualize_network', '--visualize_network',
type=str, type=str,
@ -226,7 +214,7 @@ def parse_arguments():
) )
parser.add_argument( parser.add_argument(
'--dry_run', '--dry_run',
default=BuildConfig.dry_run, default=BuildConfig.model_fields["dry_run"].default,
action='store_true', action='store_true',
help= help=
"Run through the build process except the actual Engine build for debugging." "Run through the build process except the actual Engine build for debugging."
@ -519,65 +507,37 @@ def main():
f"Overriding # of builder profiles <= {force_num_profiles_from_env}." f"Overriding # of builder profiles <= {force_num_profiles_from_env}."
) )
build_config = BuildConfig.from_dict( build_config = BuildConfig(
{ max_input_len=args.max_input_len,
'max_input_len': max_seq_len=args.max_seq_len,
args.max_input_len, max_batch_size=args.max_batch_size,
'max_seq_len': max_beam_width=args.max_beam_width,
args.max_seq_len, max_num_tokens=args.max_num_tokens,
'max_batch_size': opt_num_tokens=args.opt_num_tokens,
args.max_batch_size, max_prompt_embedding_table_size=args.
'max_beam_width': max_prompt_embedding_table_size,
args.max_beam_width, kv_cache_type=getattr(args, "kv_cache_type", None),
'max_num_tokens': gather_context_logits=args.gather_context_logits,
args.max_num_tokens, gather_generation_logits=args.gather_generation_logits,
'opt_num_tokens': strongly_typed=True,
args.opt_num_tokens, force_num_profiles=force_num_profiles_from_env,
'max_prompt_embedding_table_size': weight_sparsity=args.weight_sparsity,
args.max_prompt_embedding_table_size, profiling_verbosity=args.profiling_verbosity,
'gather_context_logits': enable_debug_output=args.enable_debug_output,
args.gather_context_logits, max_draft_len=args.max_draft_len,
'gather_generation_logits': speculative_decoding_mode=speculative_decoding_mode,
args.gather_generation_logits, input_timing_cache=args.input_timing_cache,
'strongly_typed': output_timing_cache=args.output_timing_cache,
True, dry_run=args.dry_run,
'force_num_profiles': visualize_network=args.visualize_network,
force_num_profiles_from_env, max_encoder_input_len=args.max_encoder_input_len,
'weight_sparsity': weight_streaming=args.weight_streaming,
args.weight_sparsity, monitor_memory=args.monitor_memory,
'profiling_verbosity': use_mrope=getattr(model_config, "qwen_type", None) == "qwen2_vl",
args.profiling_verbosity,
'enable_debug_output':
args.enable_debug_output,
'max_draft_len':
args.max_draft_len,
'speculative_decoding_mode':
speculative_decoding_mode,
'input_timing_cache':
args.input_timing_cache,
'output_timing_cache':
args.output_timing_cache,
'dry_run':
args.dry_run,
'visualize_network':
args.visualize_network,
'max_encoder_input_len':
args.max_encoder_input_len,
'weight_streaming':
args.weight_streaming,
'monitor_memory':
args.monitor_memory,
'use_mrope':
(True if model_config.qwen_type == "qwen2_vl" else False)
if hasattr(model_config, "qwen_type") else False
},
plugin_config=plugin_config) plugin_config=plugin_config)
if hasattr(args, 'kv_cache_type'):
build_config.update_from_dict({'kv_cache_type': args.kv_cache_type})
else: else:
build_config = BuildConfig.from_json_file(args.build_config, build_config = BuildConfig.from_json_file(args.build_config)
plugin_config=plugin_config) build_config.plugin_config = plugin_config
parallel_build(model_config, ckpt_dir, build_config, args.output_dir, parallel_build(model_config, ckpt_dir, build_config, args.output_dir,
workers, args.log_level, model_cls, **kwargs) workers, args.log_level, model_cls, **kwargs)

View File

@ -50,23 +50,23 @@ from ..logger import logger, severity_map
help="The logging level.") help="The logging level.")
@click.option("--max_beam_width", @click.option("--max_beam_width",
type=int, type=int,
default=BuildConfig.max_beam_width, default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.") help="Maximum number of beams for beam search decoding.")
@click.option("--max_batch_size", @click.option("--max_batch_size",
type=int, type=int,
default=BuildConfig.max_batch_size, default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.") help="Maximum number of requests that the engine can schedule.")
@click.option( @click.option(
"--max_num_tokens", "--max_num_tokens",
type=int, type=int,
default=BuildConfig.max_num_tokens, default=BuildConfig.model_fields["max_num_tokens"].default,
help= help=
"Maximum number of batched input tokens after padding is removed in each batch." "Maximum number of batched input tokens after padding is removed in each batch."
) )
@click.option( @click.option(
"--max_seq_len", "--max_seq_len",
type=int, type=int,
default=BuildConfig.max_seq_len, default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. " help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.") "If unspecified, the value is deduced from the model config.")
@click.option("--tp_size", type=int, default=1, help='Tensor parallelism size.') @click.option("--tp_size", type=int, default=1, help='Tensor parallelism size.')

View File

@ -2,7 +2,6 @@
Script that refits TRT-LLM engine(s) with weights in a TRT-LLM checkpoint. Script that refits TRT-LLM engine(s) with weights in a TRT-LLM checkpoint.
''' '''
import argparse import argparse
import copy
import json import json
import os import os
import re import re
@ -57,7 +56,7 @@ def refit_engine(engine_path: str, refit_engine_dir: str, checkpoint_dir: str,
# There are weights preprocess during optimize model. # There are weights preprocess during optimize model.
tik = time.time() tik = time.time()
build_config = copy.deepcopy(engine_config.build_config) build_config = engine_config.build_config.model_copy(deep=True)
optimize_model_with_config(model, build_config) optimize_model_with_config(model, build_config)
tok = time.time() tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))

View File

@ -75,25 +75,29 @@ def _signal_handler_cleanup_child(signum, frame):
sys.exit(128 + signum) sys.exit(128 + signum)
def get_llm_args(model: str, def get_llm_args(
tokenizer: Optional[str] = None, model: str,
backend: str = "pytorch", tokenizer: Optional[str] = None,
max_beam_width: int = BuildConfig.max_beam_width, backend: str = "pytorch",
max_batch_size: int = BuildConfig.max_batch_size, max_beam_width: int = BuildConfig.model_fields["max_beam_width"].
max_num_tokens: int = BuildConfig.max_num_tokens, default,
max_seq_len: int = BuildConfig.max_seq_len, max_batch_size: int = BuildConfig.model_fields["max_batch_size"].
tensor_parallel_size: int = 1, default,
pipeline_parallel_size: int = 1, max_num_tokens: int = BuildConfig.model_fields["max_num_tokens"].
moe_expert_parallel_size: Optional[int] = None, default,
gpus_per_node: Optional[int] = None, max_seq_len: int = BuildConfig.model_fields["max_seq_len"].default,
free_gpu_memory_fraction: float = 0.9, tensor_parallel_size: int = 1,
num_postprocess_workers: int = 0, pipeline_parallel_size: int = 1,
trust_remote_code: bool = False, moe_expert_parallel_size: Optional[int] = None,
reasoning_parser: Optional[str] = None, gpus_per_node: Optional[int] = None,
fail_fast_on_attention_window_too_large: bool = False, free_gpu_memory_fraction: float = 0.9,
otlp_traces_endpoint: Optional[str] = None, num_postprocess_workers: int = 0,
enable_chunked_prefill: bool = False, trust_remote_code: bool = False,
**llm_args_extra_dict: Any): reasoning_parser: Optional[str] = None,
fail_fast_on_attention_window_too_large: bool = False,
otlp_traces_endpoint: Optional[str] = None,
enable_chunked_prefill: bool = False,
**llm_args_extra_dict: Any):
if gpus_per_node is None: if gpus_per_node is None:
gpus_per_node = device_count() gpus_per_node = device_count()
@ -242,23 +246,23 @@ class ChoiceWithAlias(click.Choice):
help="The logging level.") help="The logging level.")
@click.option("--max_beam_width", @click.option("--max_beam_width",
type=int, type=int,
default=BuildConfig.max_beam_width, default=BuildConfig.model_fields["max_beam_width"].default,
help="Maximum number of beams for beam search decoding.") help="Maximum number of beams for beam search decoding.")
@click.option("--max_batch_size", @click.option("--max_batch_size",
type=int, type=int,
default=BuildConfig.max_batch_size, default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.") help="Maximum number of requests that the engine can schedule.")
@click.option( @click.option(
"--max_num_tokens", "--max_num_tokens",
type=int, type=int,
default=BuildConfig.max_num_tokens, default=BuildConfig.model_fields["max_num_tokens"].default,
help= help=
"Maximum number of batched input tokens after padding is removed in each batch." "Maximum number of batched input tokens after padding is removed in each batch."
) )
@click.option( @click.option(
"--max_seq_len", "--max_seq_len",
type=int, type=int,
default=BuildConfig.max_seq_len, default=BuildConfig.model_fields["max_seq_len"].default,
help="Maximum total length of one request, including prompt and outputs. " help="Maximum total length of one request, including prompt and outputs. "
"If unspecified, the value is deduced from the model config.") "If unspecified, the value is deduced from the model config.")
@click.option("--tp_size", type=int, default=1, help='Tensor parallelism size.') @click.option("--tp_size", type=int, default=1, help='Tensor parallelism size.')
@ -436,7 +440,7 @@ def serve(
help="The logging level.") help="The logging level.")
@click.option("--max_batch_size", @click.option("--max_batch_size",
type=int, type=int,
default=BuildConfig.max_batch_size, default=BuildConfig.model_fields["max_batch_size"].default,
help="Maximum number of requests that the engine can schedule.") help="Maximum number of requests that the engine can schedule.")
@click.option( @click.option(
"--max_num_tokens", "--max_num_tokens",

View File

@ -104,7 +104,7 @@ class BuildCache:
Get the build step for engine building. Get the build step for engine building.
''' '''
build_config_str = json.dumps(self.prune_build_config_for_cache_key( build_config_str = json.dumps(self.prune_build_config_for_cache_key(
build_config.to_dict()), build_config.model_dump(mode="json")),
sort_keys=True) sort_keys=True)
kwargs_str = json.dumps(kwargs, sort_keys=True) kwargs_str = json.dumps(kwargs, sort_keys=True)
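
For the build cache, model_dump(mode="json") plus sort_keys gives a deterministic string to key on. A standalone sketch of that idea (ExampleConfig and the hashing are illustrative, not the BuildCache implementation):

import hashlib
import json
from pydantic import BaseModel

class ExampleConfig(BaseModel):
    max_batch_size: int = 2048
    dry_run: bool = False

cfg = ExampleConfig()
payload = json.dumps(cfg.model_dump(mode="json"), sort_keys=True)
print(hashlib.sha256(payload.encode()).hexdigest()[:16])  # stable across runs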

View File

@ -0,0 +1,50 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from typing import TYPE_CHECKING
if TYPE_CHECKING:
import tensorrt_llm.bindings as _bindings
class KVCacheType(str, Enum):
"""Python enum wrapper for KVCacheType.
This is a pure Python enum that mirrors the C++ KVCacheType enum exposed
through pybind11.
"""
CONTINUOUS = "continuous"
PAGED = "paged"
DISABLED = "disabled"
@classmethod
def _missing_(cls, value):
"""Allow case-insensitive string values to be converted to enum members."""
if isinstance(value, str):
for member in cls:
if member.value.lower() == value.lower():
return member
return None
def to_cpp(self) -> '_bindings.KVCacheType':
import tensorrt_llm.bindings as _bindings
return getattr(_bindings.KVCacheType, self.name)
@classmethod
def from_cpp(cls, cpp_enum) -> 'KVCacheType':
# C++ enum's __str__ returns "KVCacheType.PAGED", extract the name
name = str(cpp_enum).split('.')[-1]
return cls(name)
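
A short usage sketch for the new wrapper; the to_cpp()/from_cpp() round trip assumes the compiled tensorrt_llm.bindings module is importable:

from tensorrt_llm.llmapi.kv_cache_type import KVCacheType

assert KVCacheType("PAGED") is KVCacheType.PAGED    # case-insensitive via _missing_
assert KVCacheType.PAGED.value == "paged"           # str-backed, so JSON/argparse friendly

cpp_enum = KVCacheType.PAGED.to_cpp()               # tensorrt_llm.bindings.KVCacheType member
assert KVCacheType.from_cpp(cpp_enum) is KVCacheType.PAGED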

View File

@ -1,5 +1,4 @@
import ast import ast
import copy
import functools import functools
import json import json
import math import math
@ -1764,17 +1763,6 @@ class BaseLlmArgs(StrictBaseModel):
ret = cls(**kwargs) ret = cls(**kwargs)
return ret return ret
def to_dict(self) -> dict:
"""Dump `LlmArgs` instance to a dict.
Returns:
dict: The dict that contains all fields of the `LlmArgs` instance.
"""
model_dict = self.model_dump(mode='json')
# TODO: the BuildConfig.to_dict and from_dict don't work well with pydantic
model_dict['build_config'] = copy.deepcopy(self.build_config)
return model_dict
@staticmethod @staticmethod
def _check_consistency(kwargs_dict: Dict[str, Any]) -> Dict[str, Any]: def _check_consistency(kwargs_dict: Dict[str, Any]) -> Dict[str, Any]:
# max_beam_width is not included since vague behavior due to lacking the support for dynamic beam width during # max_beam_width is not included since vague behavior due to lacking the support for dynamic beam width during
@ -1919,10 +1907,6 @@ class BaseLlmArgs(StrictBaseModel):
if self.max_input_len: if self.max_input_len:
kwargs["max_input_len"] = self.max_input_len kwargs["max_input_len"] = self.max_input_len
self.build_config = BuildConfig(**kwargs) self.build_config = BuildConfig(**kwargs)
else:
assert isinstance(
build_config,
BuildConfig), f"build_config is not initialized: {build_config}"
return self return self
@model_validator(mode="after") @model_validator(mode="after")
@ -2001,7 +1985,7 @@ class BaseLlmArgs(StrictBaseModel):
# TODO: remove the checker when manage weights support all data types # TODO: remove the checker when manage weights support all data types
if is_trt_llm_args and self.fast_build and (self.quant_config.quant_algo if is_trt_llm_args and self.fast_build and (self.quant_config.quant_algo
is QuantAlgo.FP8): is QuantAlgo.FP8):
self._update_plugin_config("manage_weights", True) self.build_config.plugin_config.manage_weights = True
if self.parallel_config.world_size == 1 and self.build_config: if self.parallel_config.world_size == 1 and self.build_config:
self.build_config.plugin_config.nccl_plugin = None self.build_config.plugin_config.nccl_plugin = None
@ -2166,9 +2150,6 @@ class BaseLlmArgs(StrictBaseModel):
"while LoRA prefetch is not supported") "while LoRA prefetch is not supported")
return self return self
def _update_plugin_config(self, key: str, value: Any):
setattr(self.build_config.plugin_config, key, value)
def _load_config_from_engine(self, engine_dir: Path): def _load_config_from_engine(self, engine_dir: Path):
engine_config = EngineConfig.from_json_file(engine_dir / "config.json") engine_config = EngineConfig.from_json_file(engine_dir / "config.json")
self._pretrained_config = engine_config.pretrained_config self._pretrained_config = engine_config.pretrained_config
@ -2271,10 +2252,8 @@ class TrtLlmArgs(BaseLlmArgs):
fast_build: bool = Field(default=False, description="Enable fast build.") fast_build: bool = Field(default=False, description="Enable fast build.")
# BuildConfig is introduced to give users a familiar interface to configure the model building. # BuildConfig is introduced to give users a familiar interface to configure the model building.
build_config: Optional[object] = Field( build_config: Optional[BuildConfig] = Field(default=None,
default=None, description="Build config.")
description="Build config.",
json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"})
# Prompt adapter arguments # Prompt adapter arguments
enable_prompt_adapter: bool = Field(default=False, enable_prompt_adapter: bool = Field(default=False,
@ -2405,11 +2384,10 @@ class TorchCompileConfig(StrictBaseModel):
class TorchLlmArgs(BaseLlmArgs): class TorchLlmArgs(BaseLlmArgs):
# Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs
build_config: Optional[object] = Field( build_config: Optional[BuildConfig] = Field(
default=None, default=None,
description="Build config.", description="Build config.",
exclude_from_json=True, exclude_from_json=True,
json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"},
status="deprecated", status="deprecated",
) )
@ -2911,10 +2889,7 @@ def update_llm_args_with_extra_dict(
for field_name, field_type in field_mapping.items(): for field_name, field_type in field_mapping.items():
if field_name in llm_args_dict: if field_name in llm_args_dict:
# Some fields need to be converted manually. # Some fields need to be converted manually.
if field_name in [ if field_name in ["speculative_config", "sparse_attention_config"]:
"speculative_config", "build_config",
"sparse_attention_config"
]:
llm_args_dict[field_name] = field_type.from_dict( llm_args_dict[field_name] = field_type.from_dict(
llm_args_dict[field_name]) llm_args_dict[field_name])
else: else:
@ -2928,6 +2903,10 @@ def update_llm_args_with_extra_dict(
# For trtllm-bench or trtllm-serve, build_config may be passed for the PyTorch # For trtllm-bench or trtllm-serve, build_config may be passed for the PyTorch
# backend, overwriting the knobs there since build_config always has the highest priority # backend, overwriting the knobs there since build_config always has the highest priority
if "build_config" in llm_args: if "build_config" in llm_args:
# Ensure build_config is a BuildConfig object, not a dict
if isinstance(llm_args["build_config"], dict):
llm_args["build_config"] = BuildConfig(**llm_args["build_config"])
for key in [ for key in [
"max_batch_size", "max_batch_size",
"max_num_tokens", "max_num_tokens",

View File

@ -1,4 +1,3 @@
import copy
import json import json
import os import os
import shutil import shutil
@ -530,8 +529,8 @@ class ModelLoader:
logger_debug(f"rank{mpi_rank()} begin to build engine...\n", "green") logger_debug(f"rank{mpi_rank()} begin to build engine...\n", "green")
# avoid the original build_config is modified, avoid the side effect # avoid side effects by copying the original build_config
copied_build_config = copy.deepcopy(self.build_config) copied_build_config = self.build_config.model_copy(deep=True)
copied_build_config.update_kv_cache_type(self._model_info.architecture) copied_build_config.update_kv_cache_type(self._model_info.architecture)
assert self.model is not None, "model is loaded yet." assert self.model is not None, "model is loaded yet."

View File

@ -27,10 +27,10 @@ from tensorrt_llm.models.model_weights_loader import ModelWeightsLoader
from ..._common import default_net, default_trtnet from ..._common import default_net, default_trtnet
from ..._utils import pad_vocab_size from ..._utils import pad_vocab_size
from ...bindings import KVCacheType
from ...functional import (Tensor, _create_tensor, cast, concat, from ...functional import (Tensor, _create_tensor, cast, concat,
gather_last_token_logits, index_select, shape) gather_last_token_logits, index_select, shape)
from ...layers import AttentionParams, ColumnLinear, SpecDecodingParams from ...layers import AttentionParams, ColumnLinear, SpecDecodingParams
from ...llmapi.kv_cache_type import KVCacheType
from ...module import Module, ModuleList from ...module import Module, ModuleList
from ...plugin import TRT_LLM_PLUGIN_NAMESPACE from ...plugin import TRT_LLM_PLUGIN_NAMESPACE
from ..modeling_utils import QuantConfig from ..modeling_utils import QuantConfig

View File

@ -18,9 +18,9 @@ from typing import List, Optional
import tensorrt as trt import tensorrt as trt
from ..bindings import KVCacheType
from ..functional import Tensor from ..functional import Tensor
from ..layers import MropeParams, SpecDecodingParams from ..layers import MropeParams, SpecDecodingParams
from ..llmapi.kv_cache_type import KVCacheType
from ..mapping import Mapping from ..mapping import Mapping
from ..plugin import current_all_reduce_helper from ..plugin import current_all_reduce_helper

View File

@ -21,7 +21,6 @@ import torch
from tensorrt_llm._common import default_net from tensorrt_llm._common import default_net
from tensorrt_llm._utils import numpy_to_torch, str_dtype_to_torch from tensorrt_llm._utils import numpy_to_torch, str_dtype_to_torch
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.functional import (Conditional, LayerNormPositionType, from tensorrt_llm.functional import (Conditional, LayerNormPositionType,
LayerNormType, MLPType, LayerNormType, MLPType,
PositionEmbeddingType, Tensor, assertion, PositionEmbeddingType, Tensor, assertion,
@ -32,6 +31,7 @@ from tensorrt_llm.layers import (MLP, Attention, AttentionMaskParams,
ColumnLinear, Embedding, FusedGatedMLP, ColumnLinear, Embedding, FusedGatedMLP,
GatedMLP, GroupNorm, KeyValueCacheParams, GatedMLP, GroupNorm, KeyValueCacheParams,
LayerNorm, LoraParams, RmsNorm) LayerNorm, LoraParams, RmsNorm)
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.lora_helper import (LoraConfig, from tensorrt_llm.lora_helper import (LoraConfig,
get_default_trtllm_modules_to_hf_modules, get_default_trtllm_modules_to_hf_modules,
use_lora) use_lora)

View File

@ -19,7 +19,6 @@ from .._common import default_net
from .._utils import (QuantModeWrapper, get_init_params, numpy_to_torch, from .._utils import (QuantModeWrapper, get_init_params, numpy_to_torch,
release_gc, str_dtype_to_torch, str_dtype_to_trt, release_gc, str_dtype_to_torch, str_dtype_to_trt,
trt_dtype_to_torch) trt_dtype_to_torch)
from ..bindings import KVCacheType
from ..bindings.executor import RuntimeDefaults from ..bindings.executor import RuntimeDefaults
from ..functional import (PositionEmbeddingType, Tensor, allgather, constant, from ..functional import (PositionEmbeddingType, Tensor, allgather, constant,
cp_split_plugin, gather_last_token_logits, cp_split_plugin, gather_last_token_logits,
@ -31,6 +30,7 @@ from ..layers.attention import Attention, BertAttention
from ..layers.linear import ColumnLinear, Linear, RowLinear from ..layers.linear import ColumnLinear, Linear, RowLinear
from ..layers.lora import Dora, Lora from ..layers.lora import Dora, Lora
from ..layers.moe import MOE, MoeOOTB from ..layers.moe import MOE, MoeOOTB
from ..llmapi.kv_cache_type import KVCacheType
from ..logger import logger from ..logger import logger
from ..mapping import Mapping from ..mapping import Mapping
from ..module import Module, ModuleList from ..module import Module, ModuleList

View File

@ -15,7 +15,6 @@
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Type, Union from typing import List, Optional, Tuple, Type, Union
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams, from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams,
AttentionMaskType, PositionEmbeddingType, AttentionMaskType, PositionEmbeddingType,
Tensor, gather_last_token_logits, recv, Tensor, gather_last_token_logits, recv,
@ -28,6 +27,7 @@ from tensorrt_llm.layers.linear import ColumnLinear
from tensorrt_llm.layers.lora import LoraParams from tensorrt_llm.layers.lora import LoraParams
from tensorrt_llm.layers.mlp import GatedMLP from tensorrt_llm.layers.mlp import GatedMLP
from tensorrt_llm.layers.normalization import RmsNorm from tensorrt_llm.layers.normalization import RmsNorm
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.mapping import Mapping from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.convert_utils import has_safetensors from tensorrt_llm.models.convert_utils import has_safetensors
from tensorrt_llm.models.modeling_utils import DecoderModelForCausalLM from tensorrt_llm.models.modeling_utils import DecoderModelForCausalLM

View File

@ -18,8 +18,8 @@ from collections import OrderedDict
import tensorrt as trt import tensorrt as trt
from tensorrt_llm._common import default_net from tensorrt_llm._common import default_net
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.functional import Tensor, cast, categorical_sample from tensorrt_llm.functional import Tensor, cast, categorical_sample
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.models import LLaMAForCausalLM, QWenForCausalLM from tensorrt_llm.models import LLaMAForCausalLM, QWenForCausalLM
from tensorrt_llm.models.generation_mixin import GenerationMixin from tensorrt_llm.models.generation_mixin import GenerationMixin

View File

@ -31,7 +31,6 @@ from tqdm import tqdm
import tensorrt_llm import tensorrt_llm
from tensorrt_llm._common import default_net from tensorrt_llm._common import default_net
from tensorrt_llm._utils import str_dtype_to_trt, trt_dtype_to_str from tensorrt_llm._utils import str_dtype_to_trt, trt_dtype_to_str
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.functional import (ACT2FN, AttentionMaskType, LayerNormType, from tensorrt_llm.functional import (ACT2FN, AttentionMaskType, LayerNormType,
PositionEmbeddingType, Tensor, PositionEmbeddingType, Tensor,
constant_to_tensor_) constant_to_tensor_)
@ -41,6 +40,7 @@ from tensorrt_llm.layers.attention import (Attention, AttentionParams,
BertAttention, KeyValueCacheParams, BertAttention, KeyValueCacheParams,
bert_attention, layernorm_map) bert_attention, layernorm_map)
from tensorrt_llm.layers.normalization import RmsNorm from tensorrt_llm.layers.normalization import RmsNorm
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.mapping import Mapping from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models.generation_mixin import GenerationMixin from tensorrt_llm.models.generation_mixin import GenerationMixin
from tensorrt_llm.models.model_weights_loader import (ModelWeightsFormat, from tensorrt_llm.models.model_weights_loader import (ModelWeightsFormat,

View File

@ -43,8 +43,9 @@ from tensorrt_llm.runtime.redrafter_utils import *
from .._utils import (binding_layer_type_to_str, binding_to_str_dtype, from .._utils import (binding_layer_type_to_str, binding_to_str_dtype,
pad_vocab_size, str_dtype_to_torch, torch_to_numpy, pad_vocab_size, str_dtype_to_torch, torch_to_numpy,
trt_dtype_to_torch) trt_dtype_to_torch)
from ..bindings import KVCacheType, ipc_nvls_allocate, ipc_nvls_free from ..bindings import ipc_nvls_allocate, ipc_nvls_free
from ..layers import LanguageAdapterConfig from ..layers import LanguageAdapterConfig
from ..llmapi.kv_cache_type import KVCacheType
from ..logger import logger from ..logger import logger
from ..lora_manager import LoraManager from ..lora_manager import LoraManager
from ..mapping import Mapping from ..mapping import Mapping

View File

@ -25,9 +25,10 @@ import torch
from .. import profiler from .. import profiler
from .._utils import mpi_comm, mpi_world_size, numpy_to_torch from .._utils import mpi_comm, mpi_world_size, numpy_to_torch
from ..bindings import KVCacheType, MpiComm from ..bindings import MpiComm
from ..bindings.executor import Executor from ..bindings.executor import Executor
from ..builder import Engine, EngineConfig, get_engine_version from ..builder import Engine, EngineConfig, get_engine_version
from ..llmapi.kv_cache_type import KVCacheType
from ..logger import logger from ..logger import logger
from ..mapping import Mapping from ..mapping import Mapping
from ..quantization import QuantMode from ..quantization import QuantMode
@ -86,7 +87,9 @@ def _builder_to_model_config(config: dict) -> Tuple[ModelConfig, dict]:
dtype = builder_config['precision'] dtype = builder_config['precision']
tp_size = builder_config['tensor_parallel'] tp_size = builder_config['tensor_parallel']
pp_size = builder_config.get('pipeline_parallel', 1) pp_size = builder_config.get('pipeline_parallel', 1)
kv_cache_type = KVCacheType.from_string(builder_config.get('kv_cache_type')) kv_cache_type = builder_config.get('kv_cache_type')
if kv_cache_type is not None:
kv_cache_type = KVCacheType(kv_cache_type)
world_size = tp_size * pp_size world_size = tp_size * pp_size
assert world_size == mpi_world_size(), \ assert world_size == mpi_world_size(), \
f'Engine world size ({tp_size} * {pp_size}) != Runtime world size ({mpi_world_size()})' f'Engine world size ({tp_size} * {pp_size}) != Runtime world size ({mpi_world_size()})'
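The hunk above replaces KVCacheType.from_string with direct construction of the new Python wrapper, and adds a None guard for engine configs that never recorded the field. A minimal sketch of the same pattern, assuming the serialized value is a plain string such as "PAGED" (the exact casing, and the builder_config stand-in, are assumptions for illustration):

    from tensorrt_llm.llmapi.kv_cache_type import KVCacheType

    # Stand-in for the loaded engine/builder config; older engines may omit the field.
    builder_config = {"kv_cache_type": "PAGED"}

    raw = builder_config.get("kv_cache_type")
    kv_cache_type = KVCacheType(raw) if raw is not None else None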

View File

@ -23,13 +23,13 @@ import torch
from .. import profiler from .. import profiler
from .._utils import mpi_broadcast from .._utils import mpi_broadcast
from ..bindings import (DataType, GptJsonConfig, KVCacheType, ModelConfig, from ..bindings import DataType, GptJsonConfig, ModelConfig, WorldConfig
WorldConfig)
from ..bindings import executor as trtllm from ..bindings import executor as trtllm
from ..bindings.executor import (DecodingMode, ExternalDraftTokensConfig, from ..bindings.executor import (DecodingMode, ExternalDraftTokensConfig,
OrchestratorConfig, ParallelConfig) OrchestratorConfig, ParallelConfig)
from ..builder import EngineConfig from ..builder import EngineConfig
from ..layers import MropeParams from ..layers import MropeParams
from ..llmapi.kv_cache_type import KVCacheType
from ..logger import logger from ..logger import logger
from ..mapping import Mapping from ..mapping import Mapping
from .generation import LogitsProcessor, LoraManager from .generation import LogitsProcessor, LoraManager
@ -248,7 +248,8 @@ class ModelRunnerCpp(ModelRunnerMixin):
json_config = GptJsonConfig.parse_file(config_path) json_config = GptJsonConfig.parse_file(config_path)
model_config = json_config.model_config model_config = json_config.model_config
use_kv_cache = model_config.kv_cache_type != KVCacheType.DISABLED use_kv_cache = KVCacheType.from_cpp(
model_config.kv_cache_type) != KVCacheType.DISABLED
if not model_config.use_cross_attention: if not model_config.use_cross_attention:
assert cross_kv_cache_fraction is None, "cross_kv_cache_fraction should only be used with enc-dec models." assert cross_kv_cache_fraction is None, "cross_kv_cache_fraction should only be used with enc-dec models."
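ModelRunnerCpp receives the C++ binding enum from GptJsonConfig, so the disabled-cache check now converts it through the wrapper's from_cpp before comparing. A sketch of that check, with config_path standing in for the engine's config.json as in the hunk above:

    from tensorrt_llm.bindings import GptJsonConfig
    from tensorrt_llm.llmapi.kv_cache_type import KVCacheType

    json_config = GptJsonConfig.parse_file(config_path)
    model_config = json_config.model_config

    # model_config.kv_cache_type is the C++ binding enum; convert it to the
    # Python wrapper before comparing against wrapper members.
    use_kv_cache = KVCacheType.from_cpp(model_config.kv_cache_type) != KVCacheType.DISABLED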

View File

@ -671,7 +671,8 @@ class CliFlowAccuracyTestHarness:
f"--max_tokens_in_paged_kv_cache={max_tokens_in_paged_kv_cache}" f"--max_tokens_in_paged_kv_cache={max_tokens_in_paged_kv_cache}"
]) ])
if task.MAX_INPUT_LEN + task.MAX_OUTPUT_LEN > BuildConfig.max_num_tokens: if task.MAX_INPUT_LEN + task.MAX_OUTPUT_LEN > BuildConfig.model_fields[
"max_num_tokens"].default:
summarize_cmd.append("--enable_chunked_context") summarize_cmd.append("--enable_chunked_context")
if self.extra_summarize_args: if self.extra_summarize_args:
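With BuildConfig now a Pydantic model, the declared default of max_num_tokens is read through model_fields rather than as a class attribute. A small illustration using the standard Pydantic v2 API (the top-level import is an assumption):

    from tensorrt_llm import BuildConfig  # assumed top-level export

    # model_fields maps field names to FieldInfo objects carrying the declared default.
    default_max_num_tokens = BuildConfig.model_fields["max_num_tokens"].default
    print(default_max_num_tokens)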

View File

@ -142,14 +142,14 @@ def test_llmapi_build_command_parameters_align(llm_root, llm_venv, engine_dir,
with open(os.path.join(engine_dir, "config.json"), "r") as f: with open(os.path.join(engine_dir, "config.json"), "r") as f:
engine_config = json.load(f) engine_config = json.load(f)
build_cmd_cfg = BuildConfig.from_dict( build_cmd_cfg = BuildConfig(
engine_config["build_config"]).to_dict() **engine_config["build_config"]).model_dump()
with open(os.path.join(tmpdir.name, "config.json"), "r") as f: with open(os.path.join(tmpdir.name, "config.json"), "r") as f:
llm_api_engine_cfg = json.load(f) llm_api_engine_cfg = json.load(f)
build_llmapi_cfg = BuildConfig.from_dict( build_llmapi_cfg = BuildConfig(
llm_api_engine_cfg["build_config"]).to_dict() **llm_api_engine_cfg["build_config"]).model_dump()
assert build_cmd_cfg == build_llmapi_cfg assert build_cmd_cfg == build_llmapi_cfg
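The alignment test swaps BuildConfig.from_dict(...)/.to_dict() for the Pydantic constructor and model_dump(). A hedged sketch of the same round trip; the file path and the top-level import are assumptions:

    import json

    from tensorrt_llm import BuildConfig  # assumed top-level export

    with open("engine_dir/config.json") as f:  # hypothetical engine config path
        engine_config = json.load(f)

    # BuildConfig.from_dict(d) -> BuildConfig(**d); cfg.to_dict() -> cfg.model_dump()
    build_cfg = BuildConfig(**engine_config["build_config"])
    assert build_cfg.model_dump()["max_num_tokens"] == build_cfg.max_num_tokens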

View File

@ -1636,7 +1636,7 @@ def get_allowed_models(benchmark_type=None):
if i.benchmark_type == benchmark_type) if i.benchmark_type == benchmark_type)
def get_build_config(model_name, return_dict=True) -> Union[BuildConfig]: def get_build_config(model_name, return_dict=True) -> Union[Dict, BuildConfig]:
if model_name in _allowed_configs: if model_name in _allowed_configs:
cfg = _allowed_configs[model_name].build_config cfg = _allowed_configs[model_name].build_config
return asdict(cfg) if return_dict else cfg return asdict(cfg) if return_dict else cfg

View File

@ -255,7 +255,7 @@ def get_quant_config(quantization: str):
def build_gpt(args): def build_gpt(args):
build_config = get_build_config(args.model) build_config = get_build_config(args.model)
build_config = BuildConfig.from_dict(build_config) build_config = BuildConfig(**build_config)
model_config = get_model_config(args.model) model_config = get_model_config(args.model)
if args.force_num_layer_1: if args.force_num_layer_1:
model_config['num_layers'] = 1 model_config['num_layers'] = 1
@ -1448,7 +1448,7 @@ def enc_dec_build_helper(component, build_config, model_config, args):
def build_enc_dec(args): def build_enc_dec(args):
build_config = get_build_config(args.model) build_config = get_build_config(args.model)
build_config = BuildConfig.from_dict(build_config) build_config = BuildConfig(**build_config)
model_config = get_model_config(args.model) model_config = get_model_config(args.model)
if args.force_num_layer_1: if args.force_num_layer_1:
model_config['num_layers'] = 1 model_config['num_layers'] = 1

View File

@ -9,6 +9,7 @@ import torch
from utils.runtime_defaults import assert_runtime_defaults_are_parsed_correctly from utils.runtime_defaults import assert_runtime_defaults_are_parsed_correctly
import tensorrt_llm.bindings as _tb import tensorrt_llm.bindings as _tb
from tensorrt_llm.llmapi.kv_cache_type import KVCacheType
from tensorrt_llm.mapping import Mapping from tensorrt_llm.mapping import Mapping
@ -85,6 +86,7 @@ def test_model_config():
assert model_config.use_packed_input assert model_config.use_packed_input
assert model_config.kv_cache_type is not None assert model_config.kv_cache_type is not None
# Test with C++ enums directly
for enum_val in [ for enum_val in [
_tb.KVCacheType.CONTINUOUS, _tb.KVCacheType.PAGED, _tb.KVCacheType.CONTINUOUS, _tb.KVCacheType.PAGED,
_tb.KVCacheType.DISABLED _tb.KVCacheType.DISABLED
@ -92,6 +94,17 @@ def test_model_config():
model_config.kv_cache_type = enum_val model_config.kv_cache_type = enum_val
assert model_config.kv_cache_type == enum_val assert model_config.kv_cache_type == enum_val
# Test with Python enums converted to C++
for py_enum in [
KVCacheType.CONTINUOUS, KVCacheType.PAGED, KVCacheType.DISABLED
]:
model_config.kv_cache_type = py_enum.to_cpp()
# Verify it was set correctly by comparing with C++ enum
assert model_config.kv_cache_type == getattr(_tb.KVCacheType,
py_enum.name)
# Also verify round-trip conversion works
assert KVCacheType.from_cpp(model_config.kv_cache_type) == py_enum
assert model_config.tokens_per_block == 64 assert model_config.tokens_per_block == 64
tokens_per_block = 1024 tokens_per_block = 1024
model_config.tokens_per_block = tokens_per_block model_config.tokens_per_block = tokens_per_block
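The added assertions exercise the bridge between the Python wrapper and the C++ binding enum in both directions; the same round trip in isolation looks like this:

    import tensorrt_llm.bindings as _tb
    from tensorrt_llm.llmapi.kv_cache_type import KVCacheType

    py_enum = KVCacheType.PAGED
    cpp_enum = py_enum.to_cpp()                        # Python wrapper -> C++ binding enum
    assert cpp_enum == getattr(_tb.KVCacheType, py_enum.name)
    assert KVCacheType.from_cpp(cpp_enum) == py_enum   # and back to the wrapper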

View File

@ -202,7 +202,7 @@ def test_llm_build_config():
# read the build_config and check if the parameters are correctly saved # read the build_config and check if the parameters are correctly saved
engine_config = json.load(f) engine_config = json.load(f)
build_config1 = BuildConfig.from_dict(engine_config["build_config"]) build_config1 = BuildConfig(**engine_config["build_config"])
# Known issue: this will be converted to None after saving the engine for single-gpu # Known issue: this will be converted to None after saving the engine for single-gpu
build_config1.plugin_config.nccl_plugin = 'float16' build_config1.plugin_config.nccl_plugin = 'float16'

View File

@ -64,7 +64,7 @@ speculative_config:
dict_content = self._yaml_to_dict(yaml_content) dict_content = self._yaml_to_dict(yaml_content)
llm_args = TrtLlmArgs(model=llama_model_path) llm_args = TrtLlmArgs(model=llama_model_path)
llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(), llm_args_dict = update_llm_args_with_extra_dict(llm_args.model_dump(),
dict_content) dict_content)
llm_args = TrtLlmArgs(**llm_args_dict) llm_args = TrtLlmArgs(**llm_args_dict)
assert llm_args.speculative_config.max_window_size == 4 assert llm_args.speculative_config.max_window_size == 4
@ -80,7 +80,7 @@ pytorch_backend_config: # this is deprecated
dict_content = self._yaml_to_dict(yaml_content) dict_content = self._yaml_to_dict(yaml_content)
llm_args = TrtLlmArgs(model=llama_model_path) llm_args = TrtLlmArgs(model=llama_model_path)
llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(), llm_args_dict = update_llm_args_with_extra_dict(llm_args.model_dump(),
dict_content) dict_content)
with pytest.raises(ValueError): with pytest.raises(ValueError):
llm_args = TrtLlmArgs(**llm_args_dict) llm_args = TrtLlmArgs(**llm_args_dict)
@ -96,7 +96,7 @@ build_config:
dict_content = self._yaml_to_dict(yaml_content) dict_content = self._yaml_to_dict(yaml_content)
llm_args = TrtLlmArgs(model=llama_model_path) llm_args = TrtLlmArgs(model=llama_model_path)
llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(), llm_args_dict = update_llm_args_with_extra_dict(llm_args.model_dump(),
dict_content) dict_content)
llm_args = TrtLlmArgs(**llm_args_dict) llm_args = TrtLlmArgs(**llm_args_dict)
assert llm_args.build_config.max_beam_width == 4 assert llm_args.build_config.max_beam_width == 4
@ -113,7 +113,7 @@ kv_cache_config:
dict_content = self._yaml_to_dict(yaml_content) dict_content = self._yaml_to_dict(yaml_content)
llm_args = TrtLlmArgs(model=llama_model_path) llm_args = TrtLlmArgs(model=llama_model_path)
llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(), llm_args_dict = update_llm_args_with_extra_dict(llm_args.model_dump(),
dict_content) dict_content)
llm_args = TrtLlmArgs(**llm_args_dict) llm_args = TrtLlmArgs(**llm_args_dict)
assert llm_args.kv_cache_config.enable_block_reuse == True assert llm_args.kv_cache_config.enable_block_reuse == True
@ -131,7 +131,7 @@ max_seq_len: 128
dict_content = self._yaml_to_dict(yaml_content) dict_content = self._yaml_to_dict(yaml_content)
llm_args = TrtLlmArgs(model=llama_model_path) llm_args = TrtLlmArgs(model=llama_model_path)
llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(), llm_args_dict = update_llm_args_with_extra_dict(llm_args.model_dump(),
dict_content) dict_content)
llm_args = TrtLlmArgs(**llm_args_dict) llm_args = TrtLlmArgs(**llm_args_dict)
assert llm_args.max_batch_size == 16 assert llm_args.max_batch_size == 16
@ -331,7 +331,7 @@ def test_update_llm_args_with_extra_dict_with_nested_dict():
lora_config=LoraConfig(lora_ckpt_source='hf'), lora_config=LoraConfig(lora_ckpt_source='hf'),
plugin_config=plugin_config) plugin_config=plugin_config)
extra_llm_args_dict = { extra_llm_args_dict = {
"build_config": build_config.to_dict(), "build_config": build_config.model_dump(mode="json"),
} }
llm_api_args_dict = update_llm_args_with_extra_dict(llm_api_args_dict, llm_api_args_dict = update_llm_args_with_extra_dict(llm_api_args_dict,
@ -352,8 +352,9 @@ def test_update_llm_args_with_extra_dict_with_nested_dict():
raise ValueError(f"Mismatch at {path}: {dict1} != {dict2}") raise ValueError(f"Mismatch at {path}: {dict1} != {dict2}")
return True return True
build_config_dict1 = build_config.to_dict() build_config_dict1 = build_config.model_dump(mode="json")
build_config_dict2 = initialized_llm_args.build_config.to_dict() build_config_dict2 = initialized_llm_args.build_config.model_dump(
mode="json")
check_nested_dict_equality(build_config_dict1, build_config_dict2) check_nested_dict_equality(build_config_dict1, build_config_dict2)
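Where the dumped BuildConfig is compared against JSON-loaded data or fed back through update_llm_args_with_extra_dict, the tests dump in JSON mode so enum-like fields serialize to plain values. A brief illustration; the field values are placeholders and the behaviour shown is standard Pydantic v2:

    from tensorrt_llm import BuildConfig  # assumed top-level export

    build_config = BuildConfig(max_batch_size=8, max_num_tokens=256)

    python_dump = build_config.model_dump()           # keeps rich Python values
    json_dump = build_config.model_dump(mode="json")  # coerces to JSON-safe types

    extra_llm_args_dict = {"build_config": json_dump}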
@ -498,11 +499,11 @@ class TestTrtLlmArgs:
max_num_tokens=256, max_num_tokens=256,
) )
args = TrtLlmArgs(model=llama_model_path, build_config=build_config) args = TrtLlmArgs(model=llama_model_path, build_config=build_config)
args_dict = args.to_dict() args_dict = args.model_dump()
new_args = TrtLlmArgs.from_kwargs(**args_dict) new_args = TrtLlmArgs.from_kwargs(**args_dict)
assert new_args.to_dict() == args_dict assert new_args.model_dump() == args_dict
def test_build_config_from_engine(self): def test_build_config_from_engine(self):
build_config = BuildConfig(max_batch_size=8, max_num_tokens=256) build_config = BuildConfig(max_batch_size=8, max_num_tokens=256)
@ -522,6 +523,36 @@ class TestTrtLlmArgs:
assert args.max_num_tokens == 16 assert args.max_num_tokens == 16
assert args.max_batch_size == 4 assert args.max_batch_size == 4
def test_model_dump_does_not_mutate_original(self):
"""Test that model_dump() and update_llm_args_with_extra_dict don't mutate the original."""
# Create args with specific build_config values
build_config = BuildConfig(
max_batch_size=8,
max_num_tokens=256,
)
args = TrtLlmArgs(model=llama_model_path, build_config=build_config)
# Store original values
original_max_batch_size = args.build_config.max_batch_size
original_max_num_tokens = args.build_config.max_num_tokens
# Convert to dict and pass through update_llm_args_with_extra_dict with overrides
args_dict = args.model_dump()
extra_dict = {
"max_batch_size": 128,
"max_num_tokens": 1024,
}
updated_dict = update_llm_args_with_extra_dict(args_dict, extra_dict)
# Verify original args was NOT mutated
assert args.build_config.max_batch_size == original_max_batch_size
assert args.build_config.max_num_tokens == original_max_num_tokens
# Verify updated dict has new values
new_args = TrtLlmArgs(**updated_dict)
assert new_args.build_config.max_batch_size == 128
assert new_args.build_config.max_num_tokens == 1024
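The property the new test pins down is that model_dump() hands back an independent copy, so downstream edits never write through to the original args. A compact, hedged illustration of that half of the guarantee (llama_model_path as in the tests above):

    args = TrtLlmArgs(model=llama_model_path, build_config=BuildConfig(max_batch_size=8))
    dumped = args.model_dump()

    dumped["build_config"]["max_batch_size"] = 128   # edit the copy only
    assert args.build_config.max_batch_size == 8     # the original model is untouched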
class TestStrictBaseModelArbitraryArgs: class TestStrictBaseModelArbitraryArgs:
"""Test that StrictBaseModel prevents arbitrary arguments from being accepted.""" """Test that StrictBaseModel prevents arbitrary arguments from being accepted."""