diff --git a/README.md b/README.md index b5247c457f..7deeac5ee7 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,6 @@ TensorRT-LLM [![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/) [![cuda](https://img.shields.io/badge/cuda-12.2-green)](https://developer.nvidia.com/cuda-downloads) [![trt](https://img.shields.io/badge/TRT-9.1-green)](https://developer.nvidia.com/tensorrt) -[![version](https://img.shields.io/badge/release-0.5.0-green)](./setup.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE) [Architecture](./docs/source/architecture.md)   |   [Results](./docs/source/performance.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/) @@ -173,13 +172,13 @@ Lovelace architectures. Certain limitations may, however, apply. Various numerical precisions are supported in TensorRT-LLM. The support for some of those numerical features require specific architectures: -| | FP32 | FP16 | BF16 | FP8 | INT8 | INT4 | -| :--------------------------- | :---- | :---- | :---- | :--- | :--- | :--- | -| Volta (SM70) | Y | Y | N | N | Y | Y | -| Turing (SM75) | Y | Y | N | N | Y | Y | -| Ampere (SM80, SM86) | Y | Y | Y | N | Y | Y | -| Ada-Lovelace (SM89) | Y | Y | Y | Y | Y | Y | -| Hopper (SM90) | Y | Y | Y | Y | Y | Y | +| | FP32 | FP16 | BF16 | FP8 | INT8 | INT4 | +| :------------------ | :--- | :--- | :--- | :--- | :--- | :--- | +| Volta (SM70) | Y | Y | N | N | Y | Y | +| Turing (SM75) | Y | Y | N | N | Y | Y | +| Ampere (SM80, SM86) | Y | Y | Y | N | Y | Y | +| Ada-Lovelace (SM89) | Y | Y | Y | Y | Y | Y | +| Hopper (SM90) | Y | Y | Y | Y | Y | Y | In this release of TensorRT-LLM, the support for FP8 and quantized data types (INT8 or INT4) is not implemented for all the models. 
See the @@ -217,8 +216,7 @@ The list of supported models is: * [Bert](examples/bert) * [Blip2](examples/blip2) * [BLOOM](examples/bloom) -* [ChatGLM-6B](examples/chatglm6b) -* [ChatGLM2-6B](examples/chatglm2-6b/) +* [ChatGLM](examples/chatglm), including ChatGLM-6B, ChatGLM2-6B, ChatGLM2-6B-32k, ChatGLM3-6B, ChatGLM3-6B-32k * [Falcon](examples/falcon) * [GPT](examples/gpt) * [GPT-J](examples/gptj) @@ -230,6 +228,7 @@ The list of supported models is: * [OPT](examples/opt) * [SantaCoder](examples/gpt) * [StarCoder](examples/gpt) +* [InternLM](examples/internlm) ## Performance diff --git a/benchmarks/cpp/gptSessionBenchmark.cpp b/benchmarks/cpp/gptSessionBenchmark.cpp index 1b44c55c34..aff5bf0e3c 100644 --- a/benchmarks/cpp/gptSessionBenchmark.cpp +++ b/benchmarks/cpp/gptSessionBenchmark.cpp @@ -18,12 +18,12 @@ #include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/gptSession.h" +#include "tensorrt_llm/runtime/memoryCounters.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include #include #include -#include #include #include @@ -39,14 +39,22 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con std::shared_ptr const& logger, int warmUp, int numRuns, int duration, GptSession::Config& sessionConfig, bool cudaGraphMode) { - auto const json = GptJsonConfig::parse(dataPath / "config.json"); + + std::string modelNameHyphen = modelName; + std::filesystem::path jsonFileName = dataPath / "config.json"; + if (tc::strStartsWith(modelName, "chatglm")) + { + std::replace(modelNameHyphen.begin(), modelNameHyphen.end(), '_', '-'); + jsonFileName = dataPath / (modelNameHyphen + std::string("-config.json")); + } + auto const json = GptJsonConfig::parse(jsonFileName); auto const modelConfig = json.getModelConfig(); auto const inputPacked = modelConfig.usePackedInput(); SizeType deviceCount{0}; TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); auto const worldConfig = WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism()); - auto const enginePath = dataPath / json.engineFilename(worldConfig, modelName); + auto const enginePath = dataPath / json.engineFilename(worldConfig, modelNameHyphen); auto const dtype = modelConfig.getDataType(); auto const useHalf = (dtype == nvinfer1::DataType::kHALF); @@ -78,10 +86,15 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con auto constexpr endId = 50256; auto constexpr padId = 50256; + auto& memoryCounter = MemoryCounters::getInstance(); + TLLM_LOG_INFO(memoryCounter.toString()); + for (auto const batchSize : batchSizes) { try { + TLLM_LOG_INFO(memoryCounter.toString()); + std::vector inputLenghtsHost(batchSize, maxInputLength); auto inputLenghts = bufferManager.copyFrom(inputLenghtsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); @@ -99,6 +112,9 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con inputIds = bufferManager.copyFrom( inputsHost, ITensor::makeShape({batchSize, maxInputLength}), MemoryType::kGPU); } + + TLLM_LOG_INFO(memoryCounter.toString()); + GenerationInput generationInput{ endId, padId, std::move(inputIds), std::move(inputLenghts), inputPacked}; @@ -107,6 +123,8 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + TLLM_LOG_INFO(memoryCounter.toString()); + for 
(auto r = 0; r < warmUp; ++r) { SizeType numSteps = 0; @@ -118,6 +136,8 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con } cudaDeviceSynchronize(); + TLLM_LOG_INFO(memoryCounter.toString()); + int iterIdx = 0; float curDuration = 0; while (iterIdx < numRuns || curDuration / 1000 < duration) @@ -134,6 +154,9 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con iterIdx += 1; curDuration += std::chrono::duration(end - start).count(); } + + TLLM_LOG_INFO(memoryCounter.toString()); + printf("Benchmarking done. Iteration: %d, duration: %.2f sec.\n", iterIdx, curDuration / 1000); if (worldConfig.getRank() == 0) @@ -159,7 +182,7 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con // We can ignore the OOM exception and continue the rest of the benchmark if (worldConfig.getRank() == 0) { - printf("%s", e.what()); + TLLM_LOG_EXCEPTION(e); printf( "[BENCHMARK] batch_size %d input_length %d output_length %d latency(ms) N/A tokensPerSec N/A\n", batchSize, maxInputLength, maxNewTokens); @@ -167,6 +190,7 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con continue; } } + TLLM_LOG_INFO(memoryCounter.toString()); } } @@ -200,8 +224,8 @@ int main(int argc, char* argv[]) options.add_options()("duration", "Minimal duration of iterations to measure in seconds.", cxxopts::value()->default_value("60")); - options.add_options()( - "num_micro_batches", "Number of micro batches if enabling pipeline parallelism.", cxxopts::value()); + options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value()); + options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value()); options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value()); options.add_options()( "kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value()); @@ -281,10 +305,15 @@ int main(int argc, char* argv[]) } GptSession::Config sessionConfig{0, 0, 0}; - // Argument: Number of micro batches - if (result.count("num_micro_batches")) + // Argument: Batch size for context phase + if (result.count("ctx_micro_batch_size")) { - sessionConfig.numMicroBatches = result["num_micro_batches"].as(); + sessionConfig.ctxMicroBatchSize = result["ctx_micro_batch_size"].as(); + } + // Argument: Batch size for generation phase + if (result.count("gen_micro_batch_size")) + { + sessionConfig.genMicroBatchSize = result["gen_micro_batch_size"].as(); } // Argument: Max tokens in paged K-V Cache if (result.count("max_tokens_in_paged_kvcache")) diff --git a/benchmarks/python/allowed_configs.py b/benchmarks/python/allowed_configs.py index 922f955a71..5868456318 100644 --- a/benchmarks/python/allowed_configs.py +++ b/benchmarks/python/allowed_configs.py @@ -48,6 +48,7 @@ class BuildConfig(BaseModel, extra=Extra.allow): # default value to be None, not 0 or 1 to prevent misuse rotary_pct: Optional[float] = None bias: bool = True + quantization: Optional[str] = None class ModelConfig(BaseModel): @@ -121,7 +122,7 @@ _allowed_configs = { max_input_len=512, max_output_len=200, builder_opt=None, - use_smooth_quant=True, + quantization="int8_sq_per_tensor", )), "gpt_350m_sq_per_token_channel": ModelConfig(name="gpt_350m_sq_per_token_channel", @@ -138,9 +139,7 @@ _allowed_configs = { max_input_len=512, max_output_len=200, builder_opt=None, - use_smooth_quant=True, - per_token=True, - per_channel=True, + 
quantization="int8_sq_per_token_channel", )), "gpt-next_2b": ModelConfig(name="gpt-next_2b", @@ -318,7 +317,7 @@ _allowed_configs = { max_input_len=512, max_output_len=200, builder_opt=None, - use_smooth_quant=True)), + quantization="int8_sq_per_tensor")), "gptj_6b": ModelConfig(name="gptj_6b", family="gptj", @@ -354,7 +353,7 @@ _allowed_configs = { builder_opt=None, )), "chatglm_6b": - ModelConfig(name="chatglm_6b", + ModelConfig(name="chatglm-6b", family="chatglm", benchmark_type="gpt", build_config=BuildConfig( @@ -371,7 +370,7 @@ _allowed_configs = { remove_input_padding=False, )), "chatglm2_6b": - ModelConfig(name="chatglm2_6b", + ModelConfig(name="chatglm2-6b", family="chatglm2", benchmark_type="gpt", build_config=BuildConfig( @@ -387,6 +386,23 @@ _allowed_configs = { builder_opt=None, remove_input_padding=False, )), + "chatglm3_6b": + ModelConfig(name="chatglm3-6b", + family="chatglm3", + benchmark_type="gpt", + build_config=BuildConfig( + num_layers=28, + num_heads=32, + hidden_size=4096, + vocab_size=65024, + hidden_act='swiglu', + n_positions=2048, + max_batch_size=256, + max_input_len=512, + max_output_len=200, + builder_opt=None, + remove_input_padding=False, + )), "bloom_560m": ModelConfig(name="bloom_560m", family="bloom", diff --git a/benchmarks/python/benchmark.py b/benchmarks/python/benchmark.py index aeff8b67ce..f13f35a950 100644 --- a/benchmarks/python/benchmark.py +++ b/benchmarks/python/benchmark.py @@ -18,15 +18,11 @@ from multiprocessing import Process, Queue from time import time import torch -from allowed_configs import get_allowed_models -from bert_benchmark import BERTBenchmark -from gpt_benchmark import GPTBenchmark from mem_monitor import mem_monitor -from tensorrt_llm.logger import logger - def parse_arguments(): + from allowed_configs import get_allowed_models parser = argparse.ArgumentParser( description='Benchmark TensorRT-LLM models.') parser.add_argument('-m', @@ -172,18 +168,7 @@ def parse_arguments(): help= 'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.' ) - parser.add_argument( - '--enable_fp8', - default=False, - action='store_true', - help='Use FP8 Linear layer for LMHead, Attention QKV/Dense, and MLP.') - parser.add_argument( - '--fp8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV' - ) + parser.add_argument('--csv', default=False, action="store_true", @@ -199,11 +184,38 @@ def parse_arguments(): help= 'Use latency-optimized all-reduce for tensor parallelism. Gives better performance with NVLink.' ) + parser.add_argument( + '--strongly_typed', + default=False, + action='store_true', + help= + 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' + ) + parser.add_argument( + '--quantization', + type=str, + default=None, + choices=[ + 'fp8', 'fp8_gemm', 'fp8_kv_cache', 'int8_sq_per_tensor', + 'int8_sq_per_token_channel', 'int8_weight_only', 'int4_weight_only', + 'int4_weight_only_awq', 'int4_weight_only_gptq' + ], + help="Optimize the model with specified quantization recipe") return parser.parse_args() def main(args): + # We import tensorrt_llm here because MPI is initialized when + # tensorrt_llm is imported, but mpi4py does not work well with + # the start method `spawn` of Python multiprocessing, + # so we set the start method first, then initialize MPI. 
+ from allowed_configs import get_allowed_models + from bert_benchmark import BERTBenchmark + from gpt_benchmark import GPTBenchmark + + from tensorrt_llm.logger import logger + logger.set_level(args.log_level) # Batch size @@ -235,10 +247,10 @@ def main(args): args.max_output_len, args.max_batch_size, force_num_layer_1=args.force_num_layer_1, - enable_fp8=args.enable_fp8, - fp8_kv_cache=args.fp8_kv_cache, enable_cuda_graph=args.enable_cuda_graph, - enable_custom_all_reduce=args.enable_custom_all_reduce) + enable_custom_all_reduce=args.enable_custom_all_reduce, + strongly_typed=args.strongly_typed, + quantization=args.quantization) elif args.model in get_allowed_models(benchmark_type="bert"): benchmarker = BERTBenchmark(args.engine_dir, args.model, @@ -273,8 +285,8 @@ def main(args): # Launch a subprocess to monitor memory usage q1 = Queue() # q1 is used for sending signal to subprocess q2 = Queue() # q2 is used for receiving results from subprocess - p = Process(target=mem_monitor, args=(q1, q2)) - p.start() + mem_monitor_process = Process(target=mem_monitor, args=(q1, q2)) + mem_monitor_process.start() iter_idx = 0 try: @@ -301,14 +313,14 @@ def main(args): except Exception as e: print("Found exception during benchmarking", e.with_traceback()) - p.kill() + mem_monitor_process.kill() raise e logger.debug("Sending signal to mem monitor process, start") q1.put(1) logger.debug("Sending signal to mem monitor process, done") peak_gpu_used = q2.get() logger.debug("Get peak gpu memory usage from mem monitor process, done") - p.join() + mem_monitor_process.join() logger.debug("Memory monitor process joined") latency = round(sum(latencies) / iter_idx, 3) diff --git a/benchmarks/python/gpt_benchmark.py b/benchmarks/python/gpt_benchmark.py index 94f60d0dcc..88ec9f7443 100644 --- a/benchmarks/python/gpt_benchmark.py +++ b/benchmarks/python/gpt_benchmark.py @@ -24,8 +24,7 @@ import tensorrt_llm from tensorrt_llm._utils import str_dtype_to_trt from tensorrt_llm.builder import Builder from tensorrt_llm.layers import PositionEmbeddingType -from tensorrt_llm.models import (fp8_quantize, smooth_quantize, - weight_only_quantize) +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -61,6 +60,7 @@ class GPTBenchmark(BaseBenchmark): self.fuse_bias = True self.cuda_graph_mode = kwargs.get('enable_cuda_graph', False) + self.strongly_typed = kwargs.get('strongly_typed', False) self.enable_custom_all_reduce = enable_custom_all_reduce if engine_dir is not None: @@ -73,12 +73,9 @@ class GPTBenchmark(BaseBenchmark): # Build engine self.world_size = tensorrt_llm.mpi_world_size() self.apply_query_key_layer_scaling = False - self.use_smooth_quant = False - # this attribute is not stored in allowed_config - self.enable_fp8 = kwargs.get('enable_fp8', False) - self.fp8_kv_cache = kwargs.get('fp8_kv_cache', False) self.use_weight_only = False + self.per_group = False self.weight_only_precision = 'int8' self.per_token = False self.per_channel = False @@ -95,12 +92,17 @@ class GPTBenchmark(BaseBenchmark): self.use_rmsnorm_plugin = False self.use_lookup_plugin = non_mha_plg_dtype self.enable_context_fmha = use_mha_plugin - self.quant_mode = QuantMode(0) + self.remove_input_padding = use_non_mha_plugin for key, value in get_build_config(model_name).items(): setattr(self, key, value) + if self.quantization is None: + self.quantization = kwargs.get('quantization', None) + + 
self.set_quantization() + # Override the n_position/max_input_len/max_output_len/max_batch_size to value from cmd line if that's specified. if n_positions is not None: assert isinstance( @@ -126,20 +128,6 @@ class GPTBenchmark(BaseBenchmark): self.num_kv_heads = self.num_heads if kwargs.get('force_num_layer_1', False): self.num_layers = 1 - - if self.use_smooth_quant: - self.quant_mode = QuantMode.use_smooth_quant( - self.per_token, self.per_channel) - elif self.use_weight_only: - self.quant_mode = QuantMode.use_weight_only( - self.weight_only_precision == 'int4') - - if self.enable_fp8: - self.quant_mode = self.quant_mode.set_fp8_qdq() - - if self.fp8_kv_cache: - self.quant_mode = self.quant_mode.set_fp8_kv_cache() - engine_buffer = self.build() assert engine_buffer is not None @@ -155,16 +143,25 @@ class GPTBenchmark(BaseBenchmark): quant_mode=self.quant_mode, use_custom_all_reduce=self.enable_custom_all_reduce, ) - if model_name == 'chatglm_6b': + if model_name == 'chatglm-6b': self.sampling_config = tensorrt_llm.runtime.SamplingConfig( end_id=130005, pad_id=3, num_beams=num_beams, top_k=top_k, top_p=top_p) - self.decoder = tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession( + self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession( model_config, engine_buffer, self.runtime_mapping) - elif model_name == 'chatglm2_6b': + elif model_name == 'chatglm2-6b': + self.sampling_config = tensorrt_llm.runtime.SamplingConfig( + end_id=2, + pad_id=0, + num_beams=num_beams, + top_k=top_k, + top_p=top_p) + self.decoder = tensorrt_llm.runtime.GenerationSession( + model_config, engine_buffer, self.runtime_mapping) + elif model_name == 'chatglm3-6b': self.sampling_config = tensorrt_llm.runtime.SamplingConfig( end_id=2, pad_id=0, @@ -212,6 +209,75 @@ class GPTBenchmark(BaseBenchmark): self.decoder.setup(batch_size, inlen, outlen, beam_width=self.num_beams) return (input_ids, input_lengths) + def set_quantization(self): + self.quant_mode = QuantMode(0) + + if self.quantization == "fp8": + self.strongly_typed = True + self.quant_mode = self.quant_mode.set_fp8_qdq() + self.quant_mode = self.quant_mode.set_fp8_kv_cache() + + elif self.quantization == "fp8_gemm": + self.strongly_typed = True + self.quant_mode = self.quant_mode.set_fp8_qdq() + + elif self.quantization == "fp8_kv_cache": + self.strongly_typed = True + self.quant_mode = self.quant_mode.set_fp8_kv_cache() + + elif self.quantization == "int8_sq_per_tensor": + self.use_smooth_quant = True + self.quant_mode = QuantMode.use_smooth_quant( + self.per_token, self.per_channel) + + elif self.quantization == "int8_sq_per_token_channel": + self.use_smooth_quant = True + self.per_token = True + self.per_channel = True + self.quant_mode = QuantMode.use_smooth_quant( + self.per_token, self.per_channel) + + elif self.quantization == "int8_weight_only": + self.use_smooth_quant = False + self.use_weight_only = True + self.weight_only_precision = 'int8' + self.quant_mode = QuantMode.use_weight_only(False) + + elif self.quantization == "int4_weight_only": + self.use_weight_only = True + self.weight_only_precision = 'int4' + self.quant_mode = QuantMode.use_weight_only(True) + + elif self.quantization == "int4_weight_only_awq": + self.use_weight_only = True + self.per_group = True + self.weight_only_precision = 'int4_awq' + self.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=True, + use_int4_weights=True) + + elif self.quantization == 
"int4_weight_only_gptq": + self.use_weight_only = True + self.per_group = True + self.weight_only_precision = 'int4_gptq' + self.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=True, + use_int4_weights=True) + + elif self.quantization == None: + pass + + else: + raise Exception(f'{0} is invalid config: {self.quantization}') + def build(self): builder = Builder() builder_config = builder.create_builder_config( @@ -232,10 +298,10 @@ class GPTBenchmark(BaseBenchmark): max_input_len=self.max_input_len, max_output_len=self.max_output_len, int8=self.quant_mode.has_act_and_weight_quant(), - fp8=self.quant_mode.has_fp8_qdq(), quant_mode=self.quant_mode, use_refit=self.refit, - opt_level=self.builder_opt) + opt_level=self.builder_opt, + strongly_typed=self.strongly_typed) engine_name = get_engine_name(self.model_name, self.dtype, self.world_size, self.runtime_rank) @@ -322,7 +388,7 @@ class GPTBenchmark(BaseBenchmark): apply_query_key_layer_scaling=builder_config. apply_query_key_layer_scaling) elif family == "chatglm": - tensorrt_llm_model = tensorrt_llm.models.ChatGLM6BHeadModel( + tensorrt_llm_model = tensorrt_llm.models.ChatGLMHeadModel( num_layers=self.num_layers, num_heads=self.num_heads, hidden_size=self.hidden_size, @@ -335,9 +401,10 @@ class GPTBenchmark(BaseBenchmark): tp_size=self.world_size), # TP only apply_query_key_layer_scaling=builder_config. apply_query_key_layer_scaling, - quant_mode=self.quant_mode) + quant_mode=self.quant_mode, + model_version="1") elif family == "chatglm2": - tensorrt_llm_model = tensorrt_llm.models.ChatGLM2_6BHeadModel( + tensorrt_llm_model = tensorrt_llm.models.ChatGLMHeadModel( num_layers=self.num_layers, num_heads=self.num_heads, hidden_size=self.hidden_size, @@ -350,7 +417,24 @@ class GPTBenchmark(BaseBenchmark): tp_size=self.world_size), # TP only apply_query_key_layer_scaling=builder_config. apply_query_key_layer_scaling, - quant_mode=self.quant_mode) + quant_mode=self.quant_mode, + model_version="2") + elif family == "chatglm3": + tensorrt_llm_model = tensorrt_llm.models.ChatGLMHeadModel( + num_layers=self.num_layers, + num_heads=self.num_heads, + hidden_size=self.hidden_size, + vocab_size=self.vocab_size, + hidden_act=self.hidden_act, + max_position_embeddings=self.n_positions, + dtype=kv_dtype, + mapping=tensorrt_llm.Mapping( + world_size=self.world_size, + tp_size=self.world_size), # TP only + apply_query_key_layer_scaling=builder_config. 
+ apply_query_key_layer_scaling, + quant_mode=self.quant_mode, + model_version="3") elif family == "bloom": tensorrt_llm_model = tensorrt_llm.models.BloomForCausalLM( num_layers=self.num_layers, @@ -362,6 +446,7 @@ class GPTBenchmark(BaseBenchmark): mapping=tensorrt_llm.Mapping( world_size=self.world_size, tp_size=self.world_size), # TP only + quant_mode=self.quant_mode, use_parallel_embedding=(self.model_name == 'bloom_176b')) elif family == "falcon": tensorrt_llm_model = tensorrt_llm.models.FalconForCausalLM( @@ -381,27 +466,34 @@ class GPTBenchmark(BaseBenchmark): else: raise Exception(f'Unexpected model: {self.model_name}') - if self.use_smooth_quant: - tensorrt_llm_model = smooth_quantize(tensorrt_llm_model, - self.quant_mode) - elif self.use_weight_only and self.weight_only_precision == 'int8': - tensorrt_llm_model = weight_only_quantize( - tensorrt_llm_model, QuantMode.use_weight_only()) - elif self.use_weight_only and self.weight_only_precision == 'int4': - tensorrt_llm_model = weight_only_quantize( - tensorrt_llm_model, - QuantMode.use_weight_only(use_int4_weights=True)) - elif self.enable_fp8 or self.fp8_kv_cache: - tensorrt_llm_model = fp8_quantize(tensorrt_llm_model, - self.quant_mode) + quant_kwargs = {} + if family == "llama" and self.use_weight_only: + if self.weight_only_precision == 'int4_awq': + quant_kwargs = { + "group_size": 128, + "zero": False, + "pre_quant_scale": True, + "exclude_modules": [], + } + elif self.weight_only_precision == 'int4_gptq': + quant_kwargs = { + "group_size": 128, + "zero": True, + "pre_quant_scale": False, + } + tensorrt_llm_model = quantize_model(tensorrt_llm_model, self.quant_mode, + **quant_kwargs) # Module -> Network network = builder.create_network() network.trt_network.name = engine_name + + not_fp8_quantization = self.quantization is None or "fp8" not in self.quantization + if self.use_gpt_attention_plugin: network.plugin_config.set_gpt_attention_plugin( dtype=self.use_gpt_attention_plugin) - if self.use_gemm_plugin: + if self.use_gemm_plugin and not_fp8_quantization: network.plugin_config.set_gemm_plugin(dtype=self.use_gemm_plugin) if self.use_layernorm_plugin: network.plugin_config.set_layernorm_plugin( diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d706f6e816..ad30374a11 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -27,10 +27,14 @@ project(tensorrt_llm LANGUAGES CXX) # Build options option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON) +option(BUILD_PYBIND "Build Python bindings for C++ runtime and batch manager" + OFF) option(BUILD_TESTS "Build Google tests" ON) option(BUILD_BENCHMARKS "Build benchmarks" ON) option(NVTX_DISABLE "Disable all NVTX features" ON) option(WARNING_IS_ERROR "Treat all warnings as errors" OFF) +option(FAST_BUILD "Skip compiling some kernels to accelerate compiling" OFF) +option(FAST_MATH "Compiling in fast math mode" OFF) if(NVTX_DISABLE) add_compile_definitions("NVTX_DISABLE") @@ -73,6 +77,11 @@ else() message(STATUS "Not building benchmarks") endif() +if(FAST_BUILD) + add_compile_definitions("FAST_BUILD") + message(WARNING "Skip some kernels to accelerate compilation") +endif() + # Determine CUDA version before enabling the language extension check_language(CUDA) if(CMAKE_CUDA_COMPILER) @@ -229,6 +238,10 @@ endif() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +if(FAST_MATH) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math") + message("CMAKE_CUDA_FLAGS: 
${CMAKE_CUDA_FLAGS}") +endif() set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR}) message(STATUS "COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}") @@ -333,3 +346,11 @@ if(BUILD_BENCHMARKS) add_subdirectory(${TRT_LLM_ROOT_DIR}/benchmarks/cpp ${CMAKE_BINARY_DIR}/benchmarks) endif() + +# Measure the compile time +option(MEASURE_BUILD_TIME "Measure the build time of each module" OFF) +if(MEASURE_BUILD_TIME) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_COMMAND} -E time") + set_property(GLOBAL PROPERTY RULE_LAUNCH_CUSTOM "${CMAKE_COMMAND} -E time") + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time") +endif() diff --git a/cpp/include/tensorrt_llm/batch_manager/GptManager.h b/cpp/include/tensorrt_llm/batch_manager/GptManager.h index 5eedb86551..89d7994453 100644 --- a/cpp/include/tensorrt_llm/batch_manager/GptManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/GptManager.h @@ -46,6 +46,7 @@ class GptManager public: using SizeType = tensorrt_llm::runtime::SizeType; using RequestList = std::list>; + using TensorPtr = runtime::ITensor::SharedPtr; GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, int32_t maxBeamWidth, batch_scheduler::SchedulerPolicy schedulerPolicy, GetInferenceRequestsCallback getInferenceRequestsCb, @@ -108,6 +109,9 @@ private: inline static const std::string kBeamWidthTensorName_ = "beam_width"; inline static const std::string kEndIdTensorName_ = "end_id"; inline static const std::string kPadIdTensorName_ = "pad_id"; + inline static const std::string kBadWordsListTensorName_ = "bad_words_list"; + inline static const std::string kStopWordsListTensorName_ = "stop_words_list"; + inline static const std::string kEmbeddingBiasTensorName_ = "embedding_bias"; inline static const std::string kTemperatureTensorName_ = "temperature"; inline static const std::string kRuntimeTopKTensorName_ = "runtime_top_k"; inline static const std::string kRuntimeTopPTensorName_ = "runtime_top_p"; @@ -116,6 +120,8 @@ private: inline static const std::string kMinLengthTensorName_ = "min_length"; inline static const std::string kPresencePenaltyTensorName_ = "presence_penalty"; inline static const std::string kRandomSeedTensorName_ = "random_seed"; + inline static const std::string kPromptEmbeddingTableName_ = "prompt_embedding_table"; + inline static const std::string kPromptVocabSizeName_ = "prompt_vocab_size"; inline static const std::string kOutputIdsTensorName_ = "output_ids"; inline static const std::string kSequenceLengthTensorName_ = "sequence_length"; diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 3e967aa42d..690d337ffb 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -33,6 +33,16 @@ namespace tensorrt_llm::batch_manager::kv_cache_manager { +using SizeType = tensorrt_llm::runtime::SizeType; + +struct KvCacheStats +{ + SizeType maxNumBlocks; + SizeType freeNumBlocks; + SizeType usedNumBlocks; + SizeType toksPerBlock; +}; + // Basic building block of a paged KV cache - a single // cache block. This class just holds metadata, no pointers // since it is reused across all layers. 
@@ -231,6 +241,17 @@ public: return mBlockManager.getNumFreeBlocks(); } + [[nodiscard]] KvCacheStats getKvCacheStats() const + { + KvCacheStats kvCacheStats; + kvCacheStats.maxNumBlocks = getMaxNumBlocks(); + kvCacheStats.freeNumBlocks = getNumFreeBlocks(); + kvCacheStats.usedNumBlocks = getUsedNumBlocks(); + kvCacheStats.toksPerBlock = getTokensPerBlock(); + + return kvCacheStats; + } + // Volume of [2, numKvHeads, tokensPerBlock, sizePerHead] [[nodiscard]] SizeType getBlockSize() const { diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 2703e24f5d..c577151f5a 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -16,6 +16,8 @@ #pragma once +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/samplingConfig.h" #include @@ -41,10 +43,14 @@ public: using TokenIdType = runtime::TokenIdType; using RequestIdType = std::uint64_t; using BeamTokens = std::vector>; + using TensorPtr = runtime::ITensor::SharedPtr; LlmRequest(RequestIdType requestId, SizeType maxNewTokens, std::shared_ptr> input_tokens, runtime::SamplingConfig samplingConfig, bool isStreaming, std::optional endId = std::nullopt, - std::optional padId = std::nullopt) + std::optional padId = std::nullopt, std::optional embeddingBias = std::nullopt, + std::optional badWordsList = std::nullopt, std::optional stopWordsList = std::nullopt, + std::optional promptEmbeddingTable = std::nullopt, + std::optional promptVocabSize = std::nullopt) : mRequestId(requestId) , mPromptLen(input_tokens->size()) , mMaxNewTokens(maxNewTokens) @@ -54,10 +60,25 @@ public: , mEndId(endId) , mPadId(padId) , mBatchSlot(-1) + , mEmbeddingBias(embeddingBias) + , mBadWordsList(badWordsList) + , mStopWordsList(stopWordsList) + , mPromptEmbeddingTable(promptEmbeddingTable) + , mPromptVocabSize(promptVocabSize) { mMaxSentTokenPos = mPromptLen - 1; // Scatter the input tokens to other beam mTokens = std::make_shared(mSamplingConfig.beamWidth, *input_tokens); + + if ((mPromptEmbeddingTable.has_value() && !mPromptVocabSize.has_value()) + || (!mPromptEmbeddingTable.has_value() && mPromptVocabSize.has_value())) + { + std::string errStr + = "Prompt embedding table and prompt vocab size tensors must both be provided for requests with prompt " + "tuning enabled."; + TLLM_LOG_ERROR(errStr); + throw std::runtime_error(errStr); + } } /// @brief Get total number of tokens for this req (prompt + generated) @@ -104,6 +125,14 @@ public: return getMaxBeamNumTokens() - mPromptLen; } + /// @brief Add new generated tokens to the vector of tokens + /// @param token The token to add + /// @param beam The beam to which to add the new token + void addNewToken(TokenIdType token, SizeType beam) + { + mTokens->at(beam).push_back(token); + } + /// @brief Add new generated tokens to the vector of tokens /// @param beamTokens A vector containing the tokens to add for each beam index /// beamTokens is expected to be of size beamWidth @@ -174,6 +203,46 @@ public: mMaxSentTokenPos = pos; } + std::optional getPromptEmbeddingTable() const + { + return mPromptEmbeddingTable; + } + + void movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager) + { + if (!mPromptEmbeddingTable.has_value() + || mPromptEmbeddingTable.value()->getMemoryType() == runtime::MemoryType::kGPU) + { + return; + } + else + { + TensorPtr gpuPromptEmbeddingTable + = 
manager.copyFrom(*mPromptEmbeddingTable.value(), runtime::MemoryType::kGPU); + mPromptEmbeddingTable = gpuPromptEmbeddingTable; + } + } + + std::optional getPromptVocabSize() const + { + return mPromptVocabSize; + } + + std::optional getEmbeddingBias() const + { + return mEmbeddingBias; + } + + std::optional getBadWordsList() const + { + return mBadWordsList; + } + + std::optional getStopWordsList() const + { + return mStopWordsList; + } + RequestIdType mRequestId; SizeType mPromptLen; SizeType mMaxNewTokens; @@ -188,6 +257,13 @@ public: private: std::shared_ptr mTokens; SizeType mMaxSentTokenPos; + + std::optional mEmbeddingBias; + std::optional mBadWordsList; + std::optional mStopWordsList; + + std::optional mPromptEmbeddingTable; + std::optional mPromptVocabSize; }; } // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/runtime/generationInput.h b/cpp/include/tensorrt_llm/runtime/generationInput.h index 3343587c55..840bc247a8 100644 --- a/cpp/include/tensorrt_llm/runtime/generationInput.h +++ b/cpp/include/tensorrt_llm/runtime/generationInput.h @@ -19,6 +19,7 @@ #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/promptTuningParams.h" #include #include @@ -26,18 +27,20 @@ namespace tensorrt_llm::runtime { -class GenerationInput +template +class GenericGenerationInput { public: - using TensorPtr = ITensor::SharedPtr; + using TensorPtr = TTensor; - explicit GenerationInput( + explicit GenericGenerationInput( SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false) : endId{endId} , padId{padId} , ids{std::move(ids)} , lengths{std::move(lengths)} , packed{packed} + , maxNewTokens(std::nullopt) { TLLM_CHECK_WITH_INFO(static_cast(this->ids), "Invalid ids tensor"); TLLM_CHECK_WITH_INFO(static_cast(this->lengths), "Invalid lengths tensor"); @@ -55,6 +58,22 @@ public: TensorPtr badWordsList; // [2, badWordsLength] or [batchSize, 2, badWordsLength], on gpu TensorPtr stopWordsList; // [batchSize, 2, stopWordsLength], on gpu std::optional maxNewTokens; // max number of tokens to generate + + // Ptuning parameters + PromptTuningParams promptTuningParams; // See promptTuningParams.h for expected shapes +}; + +class GenerationInput : public GenericGenerationInput +{ +public: + using Base = GenericGenerationInput; + using TensorPtr = Base::TensorPtr; + + explicit GenerationInput( + SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false) + : GenericGenerationInput(endId, padId, std::move(ids), std::move(lengths), packed) + { + } }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/generationOutput.h b/cpp/include/tensorrt_llm/runtime/generationOutput.h index c601066c06..33b7d7272e 100644 --- a/cpp/include/tensorrt_llm/runtime/generationOutput.h +++ b/cpp/include/tensorrt_llm/runtime/generationOutput.h @@ -26,14 +26,14 @@ namespace tensorrt_llm::runtime { -class GenerationOutput +template +class GenericGenerationOutput { public: - using TensorPtr = ITensor::SharedPtr; - + using TensorPtr = TTensor; using Callback = std::function; - explicit GenerationOutput(TensorPtr ids, TensorPtr lengths) + explicit GenericGenerationOutput(TensorPtr ids, TensorPtr lengths) : ids{std::move(ids)} , lengths{std::move(lengths)} { @@ -53,4 +53,16 @@ public: Callback onTokenGenerated; }; +class GenerationOutput : public GenericGenerationOutput +{ +public: + using Base = 
GenericGenerationOutput; + using TensorPtr = Base::TensorPtr; + + explicit GenerationOutput(TensorPtr ids, TensorPtr lengths) + : GenericGenerationOutput(std::move(ids), std::move(lengths)) + { + } +}; + } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/gptModelConfig.h b/cpp/include/tensorrt_llm/runtime/gptModelConfig.h index c49b55e291..325a724662 100644 --- a/cpp/include/tensorrt_llm/runtime/gptModelConfig.h +++ b/cpp/include/tensorrt_llm/runtime/gptModelConfig.h @@ -29,7 +29,7 @@ public: enum class ModelVariant : std::int32_t { kGpt = 0, - kGlm = 1, // https://github.com/THUDM/GLM + kGlm = 1, // https://github.com/THUDM/GLM and https://github.com/THUDM/ChatGLM-6B }; constexpr explicit GptModelConfig( @@ -52,6 +52,7 @@ public: , mComputeContextLogits(false) , mModelVariant(ModelVariant::kGpt) , mUseCustomAllReduce(false) + , mMaxPromptEmbeddingTableSize(0) { } @@ -196,6 +197,21 @@ public: mMaxNumTokens = maxNumTokens; } + [[nodiscard]] bool constexpr usePromptTuning() const noexcept + { + return mMaxPromptEmbeddingTableSize > 0; + } + + [[nodiscard]] SizeType constexpr getMaxPromptEmbeddingTableSize() const noexcept + { + return mMaxPromptEmbeddingTableSize; + } + + void constexpr setMaxPromptEmbeddingTableSize(SizeType maxPromptEmbeddingTableSize) noexcept + { + mMaxPromptEmbeddingTableSize = maxPromptEmbeddingTableSize; + } + [[nodiscard]] bool constexpr computeContextLogits() const noexcept { return mComputeContextLogits; @@ -246,6 +262,8 @@ private: bool mComputeContextLogits; ModelVariant mModelVariant; bool mUseCustomAllReduce; + + SizeType mMaxPromptEmbeddingTableSize; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/gptSession.h b/cpp/include/tensorrt_llm/runtime/gptSession.h index dc603e5d21..fc490e3d95 100644 --- a/cpp/include/tensorrt_llm/runtime/gptSession.h +++ b/cpp/include/tensorrt_llm/runtime/gptSession.h @@ -53,10 +53,11 @@ namespace utils std::vector loadEngine(std::string const& enginePath); } -class TllmRuntime; +class IpcMemory; class IStatefulGptDecoder; class NcclCommunicator; class RuntimeBuffers; +class TllmRuntime; class GptSession { @@ -85,7 +86,8 @@ public: bool decoderPerRequest{false}; bool cudaGraphMode{false}; KvCacheConfig kvCacheConfig{}; - std::optional numMicroBatches = std::nullopt; + std::optional ctxMicroBatchSize = std::nullopt; + std::optional genMicroBatchSize = std::nullopt; }; GptSession(Config const& sessionConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig, @@ -136,7 +138,7 @@ private: void setup(Config const& sessionConfig); - void createContexts(SizeType numMicroBatches, bool useCudaGraphs); + void createContexts(SizeType numBatchesCtx, SizeType numBatchesGen, bool useCudaGraphs); void createBuffers(SizeType numMicroBatches); void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches); @@ -144,6 +146,12 @@ private: SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, KvCacheConfig const& config); void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength); + void executeContextStep(std::vector const& microBatches, + std::vector const& microBatchOffsets, KvCacheManager const* kvCacheManager); + SizeType executeGenerationStep(SizeType step, std::vector const& microBatches, + std::vector const& microBatchOffsets, KvCacheManager* kvCacheManager, + std::vector& microBatchesFinished); + 
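In this hunk, GptSession::Config drops the single numMicroBatches field in favour of separate ctxMicroBatchSize and genMicroBatchSize, mirroring the executeContextStep/executeGenerationStep split declared just above. A hedged sketch of how a caller might fill them — placeholder values, and the three positional Config fields are assumed to mean max batch size, max beam width and max sequence length:

```cpp
// Illustrative only: choosing the two new micro-batch sizes on
// GptSession::Config. Both are std::optional and may simply be left unset,
// in which case the session derives its own chunking.
#include "tensorrt_llm/runtime/gptSession.h"

tensorrt_llm::runtime::GptSession::Config makeSessionConfig(int maxBatchSize)
{
    // Positional fields assumed: {maxBatchSize, maxBeamWidth, maxSequenceLength}.
    tensorrt_llm::runtime::GptSession::Config config{maxBatchSize, /*beamWidth*/ 1, /*maxSeqLen*/ 2048};
    // Context (prefill) and generation phases can now be chunked independently,
    // replacing the previous single numMicroBatches knob.
    config.ctxMicroBatchSize = maxBatchSize / 4;
    config.genMicroBatchSize = maxBatchSize / 2;
    return config;
}
```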
//! @brief Execute decoder on last PP rank, receive decoder output on other PP ranks. void decoderStepAsync(SizeType decoderStep, SizeType microBatchId); @@ -156,11 +164,11 @@ private: void kvCacheAddSequences(SizeType beamWidth, SizeType microBatchId, SizeType firstBatchIdx); - ITensor::SharedPtr initNewTokens( - GenerationInput const& inputs, SamplingConfig const& samplingConfig, SizeType microBatchId); + //! @brief Populate outputIds and return reference to newTokens tensor + ITensor::SharedPtr initDecoder(ITensor& outputIds, GenerationInput const& inputs, + SamplingConfig const& samplingConfig, SizeType microBatchId) const; - std::function createOnTokenGeneratedCallback( - GenerationOutput& outputs, SizeType numMicroBatches); + std::function createOnTokenGeneratedCallback(GenerationOutput& outputs); class CudaGraphExecutor { @@ -196,6 +204,45 @@ private: cudaGraphExec_t mInstance; }; + class MicroBatchConfig + { + public: + MicroBatchConfig() + : numCtxBatches{1} + , numGenBatches{1} + , ctxBatchSize{0} + , genBatchSize{0} + { + } + + explicit MicroBatchConfig(SizeType maxBatchSize, SizeType pipelineParallelism, + std::optional genMicroBatchSize, std::optional ctxMicroBatchSize); + + constexpr SizeType numCtxPerGen() const + { + return numCtxBatches / numGenBatches; + } + + //! @details First 2 * numGenBatches contexts are for generation phase, next numCtxBatches are for context + //! phase. Use numCtxPerGen() contexts for the context batches of each generation batch. + constexpr SizeType getCtxContextId(SizeType generationBatchId, SizeType contextBatchId) const + { + return 2 * numGenBatches + generationBatchId * numCtxPerGen() + contextBatchId; + } + + //! @details First 2 * numGenBatches contexts are for generation phase, flip-flop between 2 of them for each + //! generation batch. 
+ constexpr SizeType getGenContextId(SizeType flipFlopId, SizeType generationBatchId) const + { + return flipFlopId * numGenBatches + generationBatchId; + } + + SizeType numCtxBatches; + SizeType numGenBatches; + SizeType ctxBatchSize; + SizeType genBatchSize; + }; + friend class batch_manager::TrtGptModelV1; private: @@ -206,13 +253,17 @@ private: std::shared_ptr mCommStream; CudaEvent mCommEvent{}; + // tensor parallelism with custom allreduce plugin + ITensor::SharedPtr mCommPtrs; + std::vector> mIpcMemoryHandles; + SizeType mDecoderMaxSequenceLength{}; LoggerPtr mLogger; std::shared_ptr mRuntime; std::shared_ptr mKvCacheManager; - SizeType mNumMicroBatches; + MicroBatchConfig mMicroBatchConfig; // for each micro batch std::vector> mDecoders; std::vector> mBuffers; diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h index 270058a6de..667c1e58f1 100644 --- a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h +++ b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h @@ -35,9 +35,10 @@ namespace decoder_batch class Request { public: - using TensorPtr = std::shared_ptr; + using ConstTensorPtr = std::shared_ptr; + using TensorPtr = std::shared_ptr; - explicit Request(TensorPtr ids, std::optional maxNewTokens = std::nullopt, + explicit Request(ConstTensorPtr ids, std::optional maxNewTokens = std::nullopt, std::optional endId = std::nullopt, std::optional padId = std::nullopt) : ids{std::move(ids)} , maxNewTokens{maxNewTokens} @@ -46,7 +47,7 @@ public: } // mandatory parameters - TensorPtr ids; // [inputSeqLen], the input sequence of token ids, on gpu + ConstTensorPtr ids; // [inputSeqLen], the input sequence of token ids, on gpu // optional parameters std::optional maxNewTokens; // maximum number of tokens to generate for this request diff --git a/cpp/include/tensorrt_llm/runtime/iTensor.h b/cpp/include/tensorrt_llm/runtime/iTensor.h index 931d58a361..a5847b4ef2 100644 --- a/cpp/include/tensorrt_llm/runtime/iTensor.h +++ b/cpp/include/tensorrt_llm/runtime/iTensor.h @@ -114,6 +114,25 @@ public: return newDims; } + //! + //! \brief Add a *unit* dimension to `shape` at the specified position. + //! + //! \param shape The shape to unsqueeze. + //! \param dim The dimension where unit dimension should be added. + //! \return A new shape with the added unit dimension. + //! + static Shape unsqueeze(Shape const& shape, SizeType dim) + { + TLLM_CHECK_WITH_INFO(dim <= shape.nbDims && dim >= 0, + common::fmtstr("Invalid dim %d, tensor has %d dimensions", dim, shape.nbDims)); + + Shape newDims{shape.nbDims + 1}; + std::copy(shape.d, shape.d + dim, newDims.d); + newDims.d[dim] = 1; + std::copy(shape.d + dim, shape.d + shape.nbDims, newDims.d + dim + 1); + return newDims; + } + //! //! \brief Removes the given *unit* dimensions from this tensor. //! @@ -122,6 +141,14 @@ public: reshape(squeeze(getShape(), dim)); } + //! + //! \brief Adds a *unit* dimension at the specified position + //! + void unsqueeze(SizeType dim) + { + reshape(unsqueeze(getShape(), dim)); + } + //! //! \brief Creates a sliced view on the underlying `tensor`. The view will have the same data type as `tensor`. //! 
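The ITensor::unsqueeze helper added above is pure shape bookkeeping: it copies the existing dimensions and splices a unit dimension in at position dim; the member overload then calls reshape() with the new shape. A minimal standalone restatement of that index arithmetic — plain std::vector instead of nvinfer1::Dims, and the function name is mine:

```cpp
// Illustrative only: the shape arithmetic behind ITensor::unsqueeze, restated
// so it compiles without the TensorRT-LLM headers.
// insertUnitDim({4, 6}, 1) -> {4, 1, 6}; dim may range from 0 to dims.size().
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<std::int64_t> insertUnitDim(std::vector<std::int64_t> dims, std::size_t dim)
{
    assert(dim <= dims.size() && "dim must lie in [0, nbDims]");
    dims.insert(dims.begin() + static_cast<std::ptrdiff_t>(dim), 1);
    return dims;
}
```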
diff --git a/cpp/include/tensorrt_llm/runtime/memoryCounters.h b/cpp/include/tensorrt_llm/runtime/memoryCounters.h index 503591301d..ff68a80a95 100644 --- a/cpp/include/tensorrt_llm/runtime/memoryCounters.h +++ b/cpp/include/tensorrt_llm/runtime/memoryCounters.h @@ -127,6 +127,8 @@ public: static std::string bytesToString(DiffType bytes, int precision = 2); + std::string toString() const; + private: SizeType mGpu{}, mCpu{}, mPinned{}; DiffType mGpuDiff{}, mCpuDiff{}, mPinnedDiff{}; diff --git a/cpp/include/tensorrt_llm/runtime/promptTuningParams.h b/cpp/include/tensorrt_llm/runtime/promptTuningParams.h new file mode 100644 index 0000000000..3690165f53 --- /dev/null +++ b/cpp/include/tensorrt_llm/runtime/promptTuningParams.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/tllmBuffers.h" + +#include +#include + +namespace tensorrt_llm::runtime +{ + +template +class GenericPromptTuningParams +{ +public: + using TensorPtr = TTensor; + using SizeType = tensorrt_llm::runtime::SizeType; + + explicit GenericPromptTuningParams( + TensorPtr embeddingTable = TensorPtr(), TensorPtr tasks = TensorPtr(), TensorPtr vocabSize = TensorPtr()) + : embeddingTable{std::move(embeddingTable)} + , tasks{std::move(tasks)} + , vocabSize{std::move(vocabSize)} {}; + + // The prompt embedding table + TensorPtr embeddingTable; // [numTasks * taskVocabSize, hidden_dim], on gpu + // In GenerationInput, tasks expected shape is [batchSize] + // For context requests with non-packed inputs, expected shape is [batchSize, 1] + // For generation requests with non-packed inputs, expected shape is [batchSize*beamWidth] for generation requests. 
+ // For packed inputs, expected shape is [1, packedLength] (note that ifb currently doesn't support non-packed + // inputs) + TensorPtr tasks; + TensorPtr vocabSize; // [1], on gpu + + std::vector + promptTuningEnabled; // [batchSize] vector of bool that indicates which requests in a batch have ptuning enabled +}; + +class PromptTuningParams : public GenericPromptTuningParams +{ +public: + using TensorPtr = ITensor::SharedPtr; + using SizeType = GenericPromptTuningParams::SizeType; + + explicit PromptTuningParams( + TensorPtr embeddingTable = nullptr, TensorPtr tasks = nullptr, TensorPtr vocabSize = nullptr) + : GenericPromptTuningParams(std::move(embeddingTable), std::move(tasks), std::move(vocabSize)) + { + } + + // Fill the tasks tensor for the batch using the provided tasksHost + // Function assumes that the first numContextRequests requests in the batch are context requests + void fillTasksTensor(TensorPtr tasksHost, const SizeType batchSize, const SizeType numContextRequests, + const std::vector& reqBeamWidths, const std::vector& reqPromptLengths, + BufferManager& manager, bool packedInput); +}; + +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index d17d0cf415..2b37960333 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -70,29 +70,40 @@ if(NOT WIN32) # Linux endif() else() # Windows # AMD64, IA64, ARM64, EM64T, X86 - set(BATCH_MANAGER_TARGET_ARCH "${CMAKE_SYSTEM_PROCESSOR}-WINDOWS") - string(TOLOWER ${BATCH_MANAGER_TARGET_ARCH} ${BATCH_MANAGER_TARGET_ARCH}) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") + set(BATCH_MANAGER_TARGET_ARCH "x86_64-windows-msvc") + else() + message( + FATAL_ERROR + "The system processor type is unsupported: ${CMAKE_SYSTEM_PROCESSOR}") + endif() endif() if(BUILD_BATCH_MANAGER) add_subdirectory(batch_manager) else() add_library(${BATCH_MANAGER_TARGET} STATIC IMPORTED) - execute_process( - COMMAND ${Python3_EXECUTABLE} "-c" - "import torch; print(torch.compiled_with_cxx11_abi(),end='');" - RESULT_VARIABLE _PYTHON_SUCCESS - OUTPUT_VARIABLE USE_CXX11_ABI) + if(NOT WIN32) # Linux + execute_process( + COMMAND ${Python3_EXECUTABLE} "-c" + "import torch; print(torch.compiled_with_cxx11_abi(),end='');" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE USE_CXX11_ABI) - message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}") + message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}") - if(USE_CXX11_ABI) + if(USE_CXX11_ABI) + set(BATCH_MANAGER_LIB_LOC + "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.a" + ) + else() + set(BATCH_MANAGER_LIB_LOC + "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.pre_cxx11.a" + ) + endif() + else() # Windows set(BATCH_MANAGER_LIB_LOC - "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.a" - ) - else() - set(BATCH_MANAGER_LIB_LOC - "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.pre_cxx11.a" + "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/tensorrt_llm_batch_manager_static.lib" ) endif() set_property(TARGET ${BATCH_MANAGER_TARGET} PROPERTY IMPORTED_LOCATION @@ -132,7 +143,7 @@ set_target_properties( CXX_EXTENSIONS "NO") if(NOT MSVC) # Unix-like compilers - set(ALLOW_UNDEFINED_FLAG "-Wl, --no-undefined") + set(UNDEFINED_FLAG "-Wl,--no-undefined") else() # MSVC set(UNDEFINED_FLAG "") 
endif() @@ -158,4 +169,8 @@ if(BUILD_PYT) add_subdirectory(thop) endif() +if(BUILD_PYBIND) + add_subdirectory(pybind) +endif() + add_subdirectory(plugins) diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 789ccaf67a..6131fa3c33 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:422df71fccde81a55049fb61996d0b88bbaf1f18866b63c8e73c36b772c2df46 -size 1508332 +oid sha256:f591dd181613b14f7ded3ba3e167d14073564254bc46db8c4bd9636d6d896b16 +size 1611436 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 65af1acc24..138428aad7 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0013625bc6b18255f44d6ab38e8ea0bceda6452bddf9df3cf832ad106fc2058d -size 1516676 +oid sha256:21d17a9fa736d033ad77270a0fbcdd09c27dfab3f871d92a5ffa0cb744fa48fd +size 1623126 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index 0d5881fb58..8b007588a9 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -bda56cf4ad2242be25115ddecd23e7df libtensorrt_llm_batch_manager_static.a -12d7c8e5b4a018dfd9043fa7db979b5a libtensorrt_llm_batch_manager_static.pre_cxx11.a -7e492cc1057b1091f62d69df81547cb071729e5d commit +e1dc326c0c45864b9e7963b4d92d322f libtensorrt_llm_batch_manager_static.a +d2e9d76efe6b4173270aa6b494dfe59c libtensorrt_llm_batch_manager_static.pre_cxx11.a +07363ea7a6fdd6eeedc1670dedeeaedff7f9a848 commit diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 59f7f8d234..f30db7d141 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5a207480594cb228b7264f28af85b0a820046f64379f11fd7389c701ca5497d -size 1421186 +oid sha256:3fe444bf079ce35262b932302806b372ccb677182969e3bba45698343e5e350f +size 1523444 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 9b57e350db..130b4932c5 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80e06e15b9e29ba80c036ba6604a2ce286acb294eddb50015bad53cfdeba4534 -size 1423958 +oid 
sha256:99641389fdf26f6324b7465df0b61b74946787a6a147d145de23b444261e6e5f +size 1524188 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt new file mode 100644 index 0000000000..7bf2950986 --- /dev/null +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt @@ -0,0 +1,2 @@ +b10b0e00d0132b04969d779af45d73d0 libtensorrt_llm_batch_manager_static.a +3ad06255afdaa8450c133d1d1bc486c4 libtensorrt_llm_batch_manager_static.pre_cxx11.a diff --git a/cpp/tensorrt_llm/common/assert.cpp b/cpp/tensorrt_llm/common/assert.cpp new file mode 100755 index 0000000000..2f3f780313 --- /dev/null +++ b/cpp/tensorrt_llm/common/assert.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "assert.h" + +bool CHECK_DEBUG_ENABLED = false; + +namespace +{ + +#if !defined(_MSC_VER) +__attribute__((constructor)) +#endif +void initOnLoad() +{ + auto constexpr kDebugEnabled = "TRT_LLM_DEBUG_MODE"; + auto const debugEnabled = std::getenv(kDebugEnabled); + if (debugEnabled && debugEnabled[0] == '1') + { + CHECK_DEBUG_ENABLED = true; + } +} +} // namespace diff --git a/cpp/tensorrt_llm/common/assert.h b/cpp/tensorrt_llm/common/assert.h index 1c4bca699b..8d6daa5cee 100644 --- a/cpp/tensorrt_llm/common/assert.h +++ b/cpp/tensorrt_llm/common/assert.h @@ -30,6 +30,8 @@ namespace tensorrt_llm::common } // namespace tensorrt_llm::common +extern bool CHECK_DEBUG_ENABLED; + #if defined(_WIN32) #define TLLM_LIKELY(x) (__assume((x) == 1), (x)) #else @@ -50,6 +52,26 @@ namespace tensorrt_llm::common : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, info); \ } while (0) +#define TLLM_CHECK_DEBUG(val) \ + do \ + { \ + if (CHECK_DEBUG_ENABLED) \ + { \ + TLLM_LIKELY(static_cast(val)) ? ((void) 0) \ + : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ + } \ + } while (0) + +#define TLLM_CHECK_DEBUG_WITH_INFO(val, info) \ + do \ + { \ + if (CHECK_DEBUG_ENABLED) \ + { \ + TLLM_LIKELY(static_cast(val)) ? ((void) 0) \ + : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, info); \ + } \ + } while (0) + #define TLLM_THROW(...) 
\ do \ { \ diff --git a/cpp/tensorrt_llm/common/cudaUtils.h b/cpp/tensorrt_llm/common/cudaUtils.h index 4393039ea6..2669a06de8 100644 --- a/cpp/tensorrt_llm/common/cudaUtils.h +++ b/cpp/tensorrt_llm/common/cudaUtils.h @@ -390,6 +390,17 @@ void print2dToScreen(const T* result, const int r, const int c, const int stride print2dToStream(result, r, c, stride, stdout); } +template +void print2dToFile(std::string fname, const T* result, const int r, const int c, const int stride) +{ + FILE* fp = fopen(fname.c_str(), "wt"); + if (fp != nullptr) + { + print2dToStream(result, r, c, stride, fp); + fclose(fp); + } +} + inline void print_float_(float x) { printf("%7.3f ", x); diff --git a/cpp/tensorrt_llm/common/quantization.h b/cpp/tensorrt_llm/common/quantization.h index 4ea5fc8327..0e4f8e9f55 100644 --- a/cpp/tensorrt_llm/common/quantization.h +++ b/cpp/tensorrt_llm/common/quantization.h @@ -201,7 +201,7 @@ public: return quantMode; } - constexpr QuantMode operator+(const QuantMode& other) noexcept + constexpr QuantMode operator+(const QuantMode& other) const noexcept { return QuantMode(mValue | other.mValue); } @@ -211,7 +211,7 @@ public: return *this = *this + other; } - constexpr QuantMode operator-(const QuantMode& other) noexcept + constexpr QuantMode operator-(const QuantMode& other) const noexcept { return QuantMode(mValue & ~other.mValue); } diff --git a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh index f609874ed0..1e924fb1ca 100644 --- a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh +++ b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh @@ -296,6 +296,11 @@ struct TopK __device__ __forceinline__ void insert(T elem, int elem_id) { + if (elem_id < 0) + { + return; + } + if (elem > u[MAX_K - 1] || (p[MAX_K - 1] == -1) || ((elem == u[MAX_K - 1]) && (elem_id < p[MAX_K - 1]))) // if (elem > u[MAX_K-1] || ((elem == u[MAX_K-1]) && (elem_id < p[MAX_K-1]))) { diff --git a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu index 49e4e4be90..5ca0f47ac1 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu @@ -171,10 +171,17 @@ template void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const int** parent_ids_ptr, const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, - const float* temperatures, const float* repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, + const float* temperatures, const std::vector& h_temperatures, const float* repetition_penalties, + const std::vector& h_repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, const int* min_lengths, const int max_seq_len, cudaStream_t stream) { - if (bias != nullptr || temperatures != nullptr || vocab_size != vocab_size_padded) + +#define ALL_OF(p_, sz_, dt_, v_) (std::all_of(p_, p_ + sz_, [&](dt_ b) { return b == v_; })) + + if (bias != nullptr + || (temperatures != nullptr + && !ALL_OF(std::begin(h_temperatures) + ite * local_batch_size, local_batch_size, float, 1.0f)) + || vocab_size != vocab_size_padded) { dim3 block(512); if (std::is_same::value && vocab_size % 2 == 0 && vocab_size_padded % 2 == 0) @@ -199,14 +206,19 @@ void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const in size_t smem_size = (sizeof(T) * 
max_seq_len + 31) / 32 * 32 + sizeof(int) * max_seq_len; dim3 block(256); dim3 grid(beam_width * local_batch_size); - if (repetition_penalty_type == RepetitionPenaltyType::Multiplicative) + float default_value = getDefaultPenaltyValue(repetition_penalty_type); + if (repetition_penalty_type == RepetitionPenaltyType::Multiplicative + && !ALL_OF(std::begin(h_repetition_penalties) + ite * local_batch_size, local_batch_size, float, + default_value)) { apply_repetition_penalty<<>>(logits, batch_size, beam_width, vocab_size, vocab_size_padded, output_ids_ptr, parent_ids_ptr, input_lengths, sequence_lengths, repetition_penalties, max_seq_len); sync_check_cuda_error(); } - else if (repetition_penalty_type == RepetitionPenaltyType::Additive) + else if (repetition_penalty_type == RepetitionPenaltyType::Additive + && !ALL_OF(std::begin(h_repetition_penalties) + ite * local_batch_size, local_batch_size, float, + default_value)) { apply_repetition_penalty<<>>(logits, batch_size, beam_width, vocab_size, vocab_size_padded, output_ids_ptr, parent_ids_ptr, input_lengths, sequence_lengths, @@ -224,18 +236,22 @@ void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const in apply_min_length_penalty<<>>( logits, min_lengths, end_ids, sequence_lengths, input_lengths, beam_width, vocab_size_padded); sync_check_cuda_error(); + +#undef ALL_OF } template void invokeAddBiasApplyPenalties(float* logits, const int** output_ids_ptr, const int** parent_ids_ptr, const int* input_lengths, const int* sequence_lengths, const float* bias, const int ite, const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, - const float* temperatures, const float* repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, + const float* temperatures, const std::vector& h_temperatures, const float* repetition_penalties, + const std::vector& h_repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, const int* min_lengths, int max_seq_len, cudaStream_t stream); template void invokeAddBiasApplyPenalties(half* logits, const int** output_ids_ptr, const int** parent_ids_ptr, const int* input_lengths, const int* sequence_lengths, const half* bias, const int ite, const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, - const float* temperatures, const float* repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, + const float* temperatures, const std::vector& h_temperatures, const float* repetition_penalties, + const std::vector& h_repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, const int* min_lengths, int max_seq_len, cudaStream_t stream); } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h index 07818b261d..4ff57c69da 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h +++ b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h @@ -28,7 +28,8 @@ template void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const int** parent_ids_ptr, const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, - const float* temperatures, const float* repetition_penalties, const RepetitionPenaltyType 
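// The penalty hunks above pass host-side copies of the per-request temperatures
// and repetition penalties so a kernel launch can be skipped when every value in
// the micro-batch is the neutral default. A hedged sketch of that host check,
// with illustrative names:
#include <algorithm>
#include <cstddef>
#include <vector>

// True when every per-request value in this micro-batch equals the neutral
// default, in which case the corresponding penalty kernel launch can be skipped.
bool allDefaultInMicroBatch(std::vector<float> const& hostValues, int ite, int localBatchSize, float defaultValue)
{
    auto const first = hostValues.begin() + static_cast<std::ptrdiff_t>(ite) * localBatchSize;
    return std::all_of(first, first + localBatchSize, [defaultValue](float v) { return v == defaultValue; });
}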
repetition_penalty_type, + const float* temperatures, const std::vector& h_temperatures, const float* repetition_penalties, + const std::vector& h_repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, const int* min_lengths, int max_seq_len, cudaStream_t stream); } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu index 89d08b4419..a6acfd1688 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu @@ -47,16 +47,20 @@ void multihead_attention_( switch (params.hidden_size_per_head) { case 32: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; - case 48: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; case 64: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; + case 128: + mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); + break; + case 256: + mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); + break; +#ifndef FAST_BUILD // skip mmha 48, 80, 96, 112, 144, 160, 192 and 224 for fast build + case 48: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; case 80: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; case 96: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; case 112: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; - case 128: - mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); - break; case 144: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; @@ -69,9 +73,7 @@ void multihead_attention_( case 224: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; - case 256: - mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); - break; +#endif // FAST_BUILD default: TLLM_THROW("unsupported head_size"); } } diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_bf16.cu index f4d9fdb4da..9ccc70152a 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 112; namespace mmha { +#ifndef FAST_BUILD // skip mmha_112 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_float.cu index d5e786018f..a83d80ab07 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 112; namespace mmha { +#ifndef FAST_BUILD // skip mmha_112 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_half.cu index 
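// The dispatch hunk above keeps the common head sizes (32/64/128/256) in every
// build and compiles the rarely used ones only when FAST_BUILD is not defined,
// trading kernel coverage for build time. A simplified sketch of the pattern;
// the launch calls are placeholders, not the real kernel launchers:
#include <stdexcept>

void dispatchByHeadSize(int headSize)
{
    switch (headSize)
    {
    case 64: /* launch 64-wide kernel */ break;
    case 128: /* launch 128-wide kernel */ break;
    case 256: /* launch 256-wide kernel */ break;
#ifndef FAST_BUILD // skip the rare sizes in fast builds
    case 48: /* launch 48-wide kernel */ break;
    case 80: /* launch 80-wide kernel */ break;
    case 96: /* launch 96-wide kernel */ break;
#endif
    default: throw std::runtime_error("unsupported head_size");
    }
}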
ec6264bef6..04dd8beec4 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 112; namespace mmha { +#ifndef FAST_BUILD // skip mmha_112 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_bf16.cu index 53b0603e95..7a7bfce529 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 144; namespace mmha { +#ifndef FAST_BUILD // skip mmha_144 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_float.cu index af281bc36b..784814d5bc 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 144; namespace mmha { +#ifndef FAST_BUILD // skip mmha_144 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_half.cu index bc67ddcc9f..fa3c1763e5 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 144; namespace mmha { +#ifndef FAST_BUILD // skip mmha_144 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_bf16.cu index 365daad49b..899a52ae60 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_bf16.cu @@ -29,8 +29,10 @@ auto constexpr kSizePerHead = 160; namespace mmha { +#ifndef FAST_BUILD // skip mmha_160 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) +#endif // ENABLE_BF16 #endif } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_float.cu index a8ae5fc91b..83b038cd88 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 160; namespace mmha { +#ifndef FAST_BUILD // skip mmha_160 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_half.cu index d2811d9c88..7cd1b5f00c 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 160; namespace mmha { +#ifndef FAST_BUILD // skip mmha_160 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_bf16.cu index 314ea1feab..0e8c8b2dde 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 192; namespace mmha { +#ifndef FAST_BUILD // skip mmha_192 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_float.cu index f3df3bc4fd..9f36f82b3f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 192; namespace mmha { +#ifndef FAST_BUILD // skip mmha_192 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_half.cu index c7fe1874c0..11dc61cd74 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 192; namespace mmha { +#ifndef FAST_BUILD // skip mmha_192 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_bf16.cu index c8fc0179c6..174605c808 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_bf16.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 224; namespace mmha { +#ifndef FAST_BUILD // skip mmha_224 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_float.cu index c22677e513..04c116fb17 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 224; namespace mmha { +#ifndef FAST_BUILD // skip mmha_224 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_half.cu index b20cd5420b..4bbf980190 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 224; namespace mmha { +#ifndef FAST_BUILD // skip mmha_224 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_bf16.cu index 8cdeb7ea2b..17516ec639 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 48; namespace mmha { +#ifndef FAST_BUILD // skip mmha_48 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_float.cu index 8311b6a152..cc4201dd6d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 48; namespace mmha { +#ifndef FAST_BUILD // skip mmha_48 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_half.cu index d7b90c15bd..4b0cf08c6c 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_half.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 48; namespace mmha { +#ifndef FAST_BUILD // skip mmha_48 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_bf16.cu index f9f386b3e4..3e02da34bc 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 80; namespace mmha { +#ifndef FAST_BUILD // skip mmha_80 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_float.cu index 2759d38aea..4bbe57249e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 80; namespace mmha { +#ifndef FAST_BUILD // skip mmha_80 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_half.cu index 7891ebefe8..7eae60d985 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 80; namespace mmha { +#ifndef FAST_BUILD // skip mmha_80 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_bf16.cu index 80bbd43f68..1d28e415e9 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 96; namespace mmha { +#ifndef FAST_BUILD // skip mmha_96 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_float.cu index a1d7c1fddf..771b644d1d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_float.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 96; namespace mmha { +#ifndef FAST_BUILD // skip mmha_96 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_half.cu index be94d17088..40060ac728 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 96; namespace mmha { +#ifndef FAST_BUILD // skip mmha_96 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h index db2de627a4..ab7db6ff23 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h @@ -2152,7 +2152,6 @@ __global__ void masked_multihead_attention_kernel( const int normlization_loop_end = MULTI_BLOCK_FLAG ? timesteps_per_block : tlength; for (int ti = tidx; ti <= normlization_loop_end; ti += THREADS_PER_BLOCK) { - const int time_now = MULTI_BLOCK_FLAG ? ti + c_tile_times_timesteps_per_block : ti; if (!MULTI_BLOCK_FLAG) @@ -2308,8 +2307,11 @@ __global__ void masked_multihead_attention_kernel( } } + // Get the c_tile_id that handles the current timestep. + const int ctile_idx = tlength / timesteps_per_block; + // One group of threads computes the product(s) for the current timestep. - if (vo == tlength % V_PER_ITER && is_valid_vi && (!MULTI_BLOCK_FLAG || (c_tile == gridDim.z - 1))) + if (vo == tlength % V_PER_ITER && is_valid_vi && (!MULTI_BLOCK_FLAG || (c_tile == ctile_idx))) { const int tokenIdx = tlength; const int inBlockIdx = kvCacheBuffer.getKVLocalIdx(tokenIdx, hi_kv, Dh, vi); @@ -2396,7 +2398,6 @@ __global__ void masked_multihead_attention_kernel( } #endif // MMHA_USE_FP32_ACCUM_FOR_LOGITS } - // Make sure we can start writing to shared memory. __syncthreads(); @@ -2428,7 +2429,7 @@ __global__ void masked_multihead_attention_kernel( } const auto bhi = tensorrt_llm::common::flat_index2(batch_beam_idx, hi, num_heads); - const auto bhi_seq_len_tile = bhi * params.max_seq_len_tile; + const auto bhi_seq_len_tile = bhi * params.seq_len_tile; // Output the final values. if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { @@ -2499,9 +2500,7 @@ __global__ void masked_multihead_attention_kernel( float final_max = -FLT_MAX; float thread_partial_max = -FLT_MAX; - if (tidx < gridDim.z) - thread_partial_max = params.partial_max[bhi_seq_len_tile + tidx]; - // final_max = fmaxf(final_max, thread_partial_max); + thread_partial_max = params.partial_max[bhi_seq_len_tile + min(tidx, gridDim.x - 1)]; // Make sure we can start writing to shared memory. __syncthreads(); @@ -2548,34 +2547,29 @@ __global__ void masked_multihead_attention_kernel( // Shared memory to store partial outputs for each oi. -> size: gridDim.z * Dh * 4 Bytes. Reuse qk_smem. 
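// The multi-block reduction hunks above replace "load unconditionally, then zero
// out-of-range lanes" with a bound check applied to the load itself, so a lane
// never reads a partial result it does not own. A plain-C++ sketch of that idea
// (the real code operates on CUDA threads, not a loop):
#include <vector>

// Lanes past the number of valid partials contribute a neutral zero instead of
// reading outside the buffer; the guard is applied to the load itself.
float reducePartials(std::vector<float> const& partials, int numLanes)
{
    float acc = 0.0f;
    for (int lane = 0; lane < numLanes; ++lane)
    {
        bool const withinBound = lane < static_cast<int>(partials.size());
        float const value = withinBound ? partials[lane] : 0.0f; // guarded load
        acc += value;
    }
    return acc;
}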
T* out_oi_smem = reinterpret_cast(smem_); - // Number of threads to utilize: THREADS_PER_VALUE * gridDim.z (THREADS_PER_VALUE for vectorized output - // and gridDim.z for all the partial outputs) - int threads_boundary = THREADS_PER_VALUE * gridDim.z; // should be smaller than THREADS_PER_BLOCK - assert(threads_boundary <= THREADS_PER_BLOCK); - const auto o_idx = chunk_index(tidx); // The partial output region this thread takes care of const auto oo = o_idx.x; // The hidden dimensions computed by this particular thread. (refer to vi) const auto oi = o_idx.y; + // Within the bound. + const bool within_bound = oo < gridDim.z; + // Load partial output int thread_partial_out_offset = oo * params.batch_size * num_heads * params.hidden_size_per_head; // Load partial max (different to thread_partial_max since the threadIdx rule changes here) - float thread_partial_max_for_out = params.partial_max[bhi_seq_len_tile + oo]; + float thread_partial_max_for_out = within_bound ? params.partial_max[bhi_seq_len_tile + oo] : final_max; // Load the partial outputs. - V_vec_k thread_partial_out - = *reinterpret_cast(¶ms.partial_out[thread_partial_out_offset + bhi * Dh + oi]); - - if (tidx >= threads_boundary) - { - zero(thread_partial_out); - } + V_vec_k zero_k; + zero(zero_k); + V_vec_k thread_partial_out = within_bound + ? *reinterpret_cast(¶ms.partial_out[thread_partial_out_offset + bhi * Dh + oi]) + : zero_k; Tk factor_compute; convert_from_float(&factor_compute, __expf(thread_partial_max_for_out - final_max)); - thread_partial_out = mul(factor_compute, thread_partial_out); // Make sure we can start writing to shared memory. @@ -2620,7 +2614,6 @@ __global__ void masked_multihead_attention_kernel( convert_from_float(&inv_sum_compute, inv_sum); thread_partial_out = mul(inv_sum_compute, thread_partial_out); - *reinterpret_cast(¶ms.out[bhi * Dh + oi]) = thread_partial_out; } diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu index 94c901b92c..1ba07ca0ba 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu @@ -52,18 +52,22 @@ void invokeTopkSoftMax(const T* log_probs, const T* bias, const bool* finished, switch (log_beam_width) { // 0 < beam_width <= 4 - case 0: // 1, 2 - case 1: // 3, 4 + case 0: // 1, 2 + case 1: // 3, 4 CASE_K(4) - case 2: // 4 < beam_width <= 8 + case 2: // 4 < beam_width <= 8 CASE_K(8) - case 3: // 9 < beam_width <= 16 +#ifndef FAST_BUILD // For fast build, skip case 3, 4, 5 + case 3: // 9 < beam_width <= 16 CASE_K(16) - case 4: // 16 < beam_width <= 32 + case 4: // 16 < beam_width <= 32 CASE_K(32) - case 5: // 32 < beam_width <= 64 + case 5: // 32 < beam_width <= 64 CASE_K(64) - default: throw std::runtime_error(fmtstr("Topk kernel of beam search does not support beam_width=%d", beam_width)); +#endif // FAST_BUILD + default: + throw std::runtime_error( + fmtstr("%s:%d Topk kernel of beam search does not support beam_width=%d", __FILE__, __LINE__, beam_width)); } } diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels16.cu b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels16.cu index d1a0aa72a0..f77f9f1e4f 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels16.cu +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels16.cu @@ -20,9 +20,9 @@ namespace 
tensorrt_llm { namespace kernels { - +#ifndef FAST_BUILD // skip beam_width between [?, 16] for fast build INSTANTIATE_BEAMSEARCH_K(float, 16); INSTANTIATE_BEAMSEARCH_K(half, 16); - +#endif // FAST_BUILD } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels32.cu b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels32.cu index e6f44a126e..6b1dbd5a10 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels32.cu +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels32.cu @@ -21,8 +21,10 @@ namespace tensorrt_llm namespace kernels { +#ifndef FAST_BUILD // skip beam_width between [?, 32] for fast build INSTANTIATE_BEAMSEARCH_K(float, 32); INSTANTIATE_BEAMSEARCH_K(half, 32); +#endif // FAST_BUILD } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels64.cu b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels64.cu index 7c424d5503..e4c70ee4f4 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels64.cu +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels64.cu @@ -21,8 +21,10 @@ namespace tensorrt_llm namespace kernels { +#ifndef FAST_BUILD // skip beam_width between [?, 64] for fast build INSTANTIATE_BEAMSEARCH_K(float, 64); INSTANTIATE_BEAMSEARCH_K(half, 64); +#endif // FAST_BUILD } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernelsTemplate.h b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernelsTemplate.h index fec129aa86..fe4ae9307e 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernelsTemplate.h +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernelsTemplate.h @@ -181,16 +181,11 @@ __launch_bounds__(THREADBLOCK_SIZE) __global__ for (int i = 0; i < MAX_K; ++i) { - if (beam_hyps.num_beams != nullptr && x[total.p[i]] % vocab_size == beam_hyps.end_ids[vector_id]) + if (i < K && beam_hyps.num_beams != nullptr && x[total.p[i]] % vocab_size == beam_hyps.end_ids[vector_id]) { // if beam_token does not belong to top num_beams tokens, it should not // be added. Refer from // https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/generation_beam_search.py#L257 - if (i >= K) - { - // do nothing - } - else { const float normed_score = (float) total.u[i]; const int num_beam = beam_hyps.num_beams[global_batch_idx]; diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu index 3286af8de8..3b68220468 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu @@ -274,7 +274,11 @@ __global__ void topKStage2Sampling(const int* __restrict topKTmpIdBuf, T* topKTm randNum = randNum - expLogit; if (randNum <= 0.0f || i == k - 1) { - ids[batchId][sequenceLengths[batchId]] = topKTmpIdBuf[batchId * stride + s_id[i]] % vocabSize; + int idx = s_id[i]; + // If s_id is -1 here we force output token to the last from vocabulary to get vivid indicator of smth + // going wrong for the debug + auto outputId = idx != -1 ? 
topKTmpIdBuf[batchId * stride + idx] % vocabSize : vocabSize - 1; + ids[batchId][sequenceLengths[batchId]] = outputId; if (cumLogProbs != nullptr || outputLogProbs != nullptr) { float logProb = logf(expLogit); diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h index 3223fd9b0e..a251760428 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h @@ -27,6 +27,7 @@ namespace kernels //! Computes sequenceLength, finished state, cumLogProbs inplace. //! Sampling per request can be controlled using skipDecode, topPs and topKs parameters. //! Function sets workspaceSize and exits early if workspace is nullptr. +//! If logits are Nan, we set output token to be the last in the vocabulary. //! //! \param workspace pointer to the workspace. Has to be pre-allocated by caller. Function does not take ownership of the //! buffer. diff --git a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu index 850d43a7b8..13df179342 100644 --- a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu +++ b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu @@ -190,7 +190,8 @@ void BaseBeamSearchLayer::forward(BeamSearchOutputParams& outputs, ForwardPar invokeAddBiasApplyPenalties(logits.getPtr(), output_ids_ptr.template getPtr(), outputs.parent_ids_ptr.template getPtr(), input_lengths, sequence_length, embedding_bias, ite, local_batch_size, batch_size, beam_width, vocab_size_, vocab_size_padded_, end_ids, temperature_buf_, - repetition_penalty_buf_, mRepetitionPenaltyType, min_lengths_buf_, max_seq_len, stream_); + mTemperature, repetition_penalty_buf_, mRepetitionPenalty, mRepetitionPenaltyType, min_lengths_buf_, + max_seq_len, stream_); sync_check_cuda_error(); invokeSoftMax(outputs, params); diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt new file mode 100644 index 0000000000..209dab837a --- /dev/null +++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt @@ -0,0 +1,41 @@ +set(TRTLLM_PYBIND_MODULE bindings) +set(TRTLLM_PYBIND_MODULE + ${TRTLLM_PYBIND_MODULE} + PARENT_SCOPE) + +if(NOT BUILD_PYT) + message( + FATAL_ERROR + "Python bindings for C++ runtime require PyTorch. 
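// The sampling hunk above guards against an invalid top-k slot (for example when
// NaN logits prevented any candidate from being inserted) by emitting the last
// vocabulary id as an easy-to-spot sentinel instead of indexing with -1. A short
// sketch with illustrative names:
#include <vector>

int pickOutputToken(std::vector<int> const& topKCandidateIds, int slot, int vocabSize)
{
    // slot == -1 signals that no valid candidate exists; fall back to the sentinel.
    return slot >= 0 ? topKCandidateIds[slot] % vocabSize : vocabSize - 1;
}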
Please enable BUILD_PYT" + ) +endif() + +execute_process( + COMMAND ${Python3_EXECUTABLE} "-c" + "import pybind11 as pb11; print(pb11.get_cmake_dir(),end='');" + RESULT_VARIABLE PYBIND_CMAKE_DIR_RET + OUTPUT_VARIABLE PYBIND_CMAKE_DIR) + +if(PYBIND_CMAKE_DIR_RET MATCHES 0) + list(APPEND CMAKE_PREFIX_PATH "${PYBIND_CMAKE_DIR}") +else() + message(ERROR "pybind11 CMake directory not found.") +endif() + +find_package(pybind11 REQUIRED) + +set(SRCS bindings.cpp runtime/generationInput.cpp runtime/generationOutput.cpp) + +pybind11_add_module(${TRTLLM_PYBIND_MODULE} ${SRCS}) + +set_property(TARGET ${TRTLLM_PYBIND_MODULE} PROPERTY POSITION_INDEPENDENT_CODE + ON) + +target_link_directories(${TRTLLM_PYBIND_MODULE} PUBLIC + "${TORCH_INSTALL_PREFIX}/lib") +target_link_libraries( + ${TRTLLM_PYBIND_MODULE} + PUBLIC ${STATIC_TARGET} ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python + ${UNDEFINED_FLAG}) +target_compile_definitions(${TRTLLM_PYBIND_MODULE} + PUBLIC TRTLLM_PYBIND_MODULE=${TRTLLM_PYBIND_MODULE}) diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp new file mode 100644 index 0000000000..a1060d8976 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/bindings.cpp @@ -0,0 +1,250 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "runtime/generationInput.h" +#include "runtime/generationOutput.h" + +#include "tensorrt_llm/batch_manager/kvCacheConfig.h" +#include "tensorrt_llm/common/quantization.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/gptJsonConfig.h" +#include "tensorrt_llm/runtime/gptSession.h" +#include "tensorrt_llm/runtime/samplingConfig.h" + +namespace py = pybind11; +namespace tb = tensorrt_llm::batch_manager; +namespace tc = tensorrt_llm::common; +namespace tr = tensorrt_llm::runtime; +namespace tpr = tensorrt_llm::pybind::runtime; + +#if not defined(TRTLLM_PYBIND_MODULE) +#error "TRTLLM_PYBIND_MODULE must be defined" +#endif + +PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) +{ + m.doc() = "TensorRT-LLM Python bindings for C++ runtime"; + + py::class_(m, "PromptTuningParams") + .def(py::init(), + py::arg("embedding_table") = py::none(), py::arg("tasks") = py::none(), py::arg("vocab_size") = py::none()) + .def_readwrite("embedding_table", &tpr::PromptTuningParams::embeddingTable) + .def_readwrite("tasks", &tpr::PromptTuningParams::tasks) + .def_readwrite("vocab_size", &tpr::PromptTuningParams::vocabSize) + .def_readwrite("prompt_tuning_enabled", &tpr::PromptTuningParams::promptTuningEnabled); + + py::class_(m, "GenerationInput") + .def(py::init(), + py::arg("end_id"), py::arg("pad_id"), py::arg("ids"), py::arg("lengths"), py::arg("packed") = false) + .def_readwrite("end_id", &tpr::GenerationInput::endId) + .def_readwrite("pad_id", &tpr::GenerationInput::padId) + .def_readwrite("ids", &tpr::GenerationInput::ids) + .def_readwrite("lengths", &tpr::GenerationInput::lengths) + .def_readwrite("packed", &tpr::GenerationInput::packed) + .def_readwrite("embedding_bias", &tpr::GenerationInput::embeddingBiasOpt) + .def_readwrite("bad_words_list", &tpr::GenerationInput::badWordsList) + .def_readwrite("stop_words_list", &tpr::GenerationInput::stopWordsList) + .def_readwrite("max_new_tokens", &tpr::GenerationInput::maxNewTokens) + .def_readwrite("prompt_tuning_params", &tpr::GenerationInput::promptTuningParams); + + py::class_(m, "GenerationOutput") + .def(py::init(), py::arg("ids"), + py::arg("lengths")) + .def_readwrite("ids", &tpr::GenerationOutput::ids) + .def_readwrite("lengths", &tpr::GenerationOutput::lengths) + .def_readwrite("log_probs", &tpr::GenerationOutput::logProbs) + .def_readwrite("context_logits", &tpr::GenerationOutput::contextLogits); + + py::class_(m, "KvCacheConfig") + .def(py::init, std::optional>(), py::arg("max_tokens") = py::none(), + py::arg("free_gpu_memory_fraction") = py::none()) + .def_readwrite("max_tokens", &tb::kv_cache_manager::KvCacheConfig::maxTokens) + .def_readwrite("free_gpu_memory_fraction", &tb::kv_cache_manager::KvCacheConfig::freeGpuMemoryFraction); + + py::class_(m, "GptSessionConfig") + .def(py::init(), py::arg("max_batch_size"), py::arg("max_beam_width"), + py::arg("max_sequence_length")) + .def_readwrite("max_batch_size", &tr::GptSession::Config::maxBatchSize) + .def_readwrite("max_beam_width", &tr::GptSession::Config::maxBeamWidth) + .def_readwrite("max_sequence_length", &tr::GptSession::Config::maxSequenceLength) + .def_readwrite("decoder_per_request", &tr::GptSession::Config::decoderPerRequest) + .def_readwrite("cuda_graph_mode", &tr::GptSession::Config::cudaGraphMode) + .def_readwrite("ctx_micro_batch_size", &tr::GptSession::Config::ctxMicroBatchSize) + .def_readwrite("gen_micro_batch_size", &tr::GptSession::Config::genMicroBatchSize) + .def_readwrite("kv_cache_config", 
&tr::GptSession::Config::kvCacheConfig); + + py::enum_(m, "DataType") + .value("FLOAT", nvinfer1::DataType::kFLOAT) + .value("HALF", nvinfer1::DataType::kHALF) + .value("INT8", nvinfer1::DataType::kINT8) + .value("INT32", nvinfer1::DataType::kINT32) + .value("BOOL", nvinfer1::DataType::kBOOL) + .value("UINT8", nvinfer1::DataType::kUINT8) + .value("FP8", nvinfer1::DataType::kFP8) + .value("BF16", nvinfer1::DataType::kBF16) + .value("INT64", nvinfer1::DataType::kINT64) + .export_values(); + + py::enum_(m, "GptModelVariant") + .value("GPT", tr::GptModelConfig::ModelVariant::kGpt) + .value("GLM", tr::GptModelConfig::ModelVariant::kGlm); + + py::class_(m, "QuantMode") + .def_static("none", &tc::QuantMode::none) + .def_static("int4_weights", &tc::QuantMode::int4Weights) + .def_static("int8_weights", &tc::QuantMode::int8Weights) + .def_static("activations", &tc::QuantMode::activations) + .def_static("per_channel_scaling", &tc::QuantMode::perChannelScaling) + .def_static("per_token_scaling", &tc::QuantMode::perTokenScaling) + .def_static("per_group_scaling", &tc::QuantMode::perGroupScaling) + .def_static("int8_kv_cache", &tc::QuantMode::int8KvCache) + .def_static("fp8_kv_cache", &tc::QuantMode::fp8KvCache) + .def_static("fp8_qdq", &tc::QuantMode::fp8Qdq) + .def_property_readonly("value", &tc::QuantMode::value) + .def("is_set", &tc::QuantMode::isSet, py::arg("mode")) + .def_property_readonly("has_int4_weights", &tc::QuantMode::hasInt4Weights) + .def_property_readonly("has_int8_weights", &tc::QuantMode::hasInt8Weights) + .def_property_readonly("has_activations", &tc::QuantMode::hasActivations) + .def_property_readonly("has_per_channel_scaling", &tc::QuantMode::hasPerChannelScaling) + .def_property_readonly("has_per_token_scaling", &tc::QuantMode::hasPerTokenScaling) + .def_property_readonly("has_per_group_scaling", &tc::QuantMode::hasPerGroupScaling) + .def_property_readonly("has_static_activation_scaling", &tc::QuantMode::hasStaticActivationScaling) + .def_property_readonly("has_int8_kv_cache", &tc::QuantMode::hasInt8KvCache) + .def_property_readonly("has_fp8_kv_cache", &tc::QuantMode::hasFp8KvCache) + .def_property_readonly("has_fp8_qdq", &tc::QuantMode::hasFp8Qdq) + .def_property_readonly("has_kv_cache_quant", &tc::QuantMode::hasKvCacheQuant) + .def_static("from_description", &tc::QuantMode::fromDescription, py::arg("quantize_weights") = false, + py::arg("quantize_activations") = false, py::arg("per_token") = false, py::arg("per_channel") = false, + py::arg("use_int4_weights") = false, py::arg("use_int8_kv_cache") = false, + py::arg("use_fp8_kv_kache") = false, py::arg("use_fp8_qdq") = false) + .def(py::self + py::self) + .def(py::self += py::self) + .def(py::self - py::self) + .def(py::self -= py::self) + .def(py::self == py::self) + .def(py::self != py::self); + + py::class_(m, "GptModelConfig") + .def(py::init(), + py::arg("vocab_size"), py::arg("num_layers"), py::arg("num_heads"), py::arg("hidden_size"), + py::arg("data_type")) + .def_property_readonly("vocab_size", &tr::GptModelConfig::getVocabSize) + .def("vocab_size_padded", &tr::GptModelConfig::getVocabSizePadded, py::arg("world_size")) + .def("num_layers", &tr::GptModelConfig::getNbLayers, py::arg("pipeline_parallelism") = 1) + .def_property_readonly("num_heads", &tr::GptModelConfig::getNbHeads) + .def_property_readonly("hidden_size", &tr::GptModelConfig::getHiddenSize) + .def_property_readonly("size_per_head", &tr::GptModelConfig::getSizePerHead) + .def_property_readonly("data_type", &tr::GptModelConfig::getDataType) + 
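// The bindings above follow the standard pybind11 class_/def_readwrite pattern.
// A self-contained sketch of that pattern using a hypothetical stand-in type
// (DemoSamplingConfig) rather than the real runtime classes:
#include <pybind11/pybind11.h>

namespace py = pybind11;

struct DemoSamplingConfig // illustrative stand-in, not the real sampling config
{
    int beamWidth{1};
    float temperature{1.0f};
};

// Builds a Python extension module named demo_bindings exposing the struct.
PYBIND11_MODULE(demo_bindings, m)
{
    py::class_<DemoSamplingConfig>(m, "SamplingConfig")
        .def(py::init<>())
        .def_readwrite("beam_width", &DemoSamplingConfig::beamWidth)
        .def_readwrite("temperature", &DemoSamplingConfig::temperature);
}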
.def_property("num_kv_heads", &tr::GptModelConfig::getNbKvHeads, &tr::GptModelConfig::setNbKvHeads) + .def_property("use_gpt_attention_plugin", + py::overload_cast<>(&tr::GptModelConfig::useGptAttentionPlugin, py::const_), + py::overload_cast(&tr::GptModelConfig::useGptAttentionPlugin)) + .def_property("use_packed_input", py::overload_cast<>(&tr::GptModelConfig::usePackedInput, py::const_), + py::overload_cast(&tr::GptModelConfig::usePackedInput)) + .def_property("use_paged_kv_cache", py::overload_cast<>(&tr::GptModelConfig::usePagedKvCache, py::const_), + py::overload_cast(&tr::GptModelConfig::usePagedKvCache)) + .def_property( + "tokens_per_block", &tr::GptModelConfig::getTokensPerBlock, &tr::GptModelConfig::setTokensPerBlock) + .def_property("quant_mode", &tr::GptModelConfig::getQuantMode, &tr::GptModelConfig::setQuantMode) + .def_property_readonly("supports_inflight_batching", &tr::GptModelConfig::supportsInflightBatching) + .def_property("max_batch_size", &tr::GptModelConfig::getMaxBatchSize, &tr::GptModelConfig::setMaxBatchSize) + .def_property("max_input_len", &tr::GptModelConfig::getMaxInputLen, &tr::GptModelConfig::setMaxInputLen) + .def_property("max_output_len", &tr::GptModelConfig::getMaxOutputLen, &tr::GptModelConfig::setMaxOutputLen) + .def_property("max_num_tokens", &tr::GptModelConfig::getMaxNumTokens, &tr::GptModelConfig::setMaxNumTokens) + .def_property("compute_context_logits", + py::overload_cast<>(&tr::GptModelConfig::computeContextLogits, py::const_), + py::overload_cast(&tr::GptModelConfig::computeContextLogits)) + .def_property("model_variant", &tr::GptModelConfig::getModelVariant, &tr::GptModelConfig::setModelVariant) + .def_property("use_custom_all_reduce", py::overload_cast<>(&tr::GptModelConfig::useCustomAllReduce, py::const_), + py::overload_cast(&tr::GptModelConfig::useCustomAllReduce)); + + py::class_(m, "WorldConfig") + .def(py::init(), py::arg("tensor_parallelism") = 1, + py::arg("pipeline_parallelism") = 1, py::arg("rank") = 0, + py::arg("gpus_per_node") = tr::WorldConfig::kDefaultGpusPerNode) + .def_property_readonly("size", &tr::WorldConfig::getSize) + .def_property_readonly("tensor_parallelism", &tr::WorldConfig::getTensorParallelism) + .def_property_readonly("pipeline_parallelism", &tr::WorldConfig::getPipelineParallelism) + .def_property_readonly("is_tensor_parallel", &tr::WorldConfig::isTensorParallel) + .def_property_readonly("is_pipeline_parallel", &tr::WorldConfig::isPipelineParallel) + .def_property_readonly("rank", &tr::WorldConfig::getRank) + .def_property_readonly("gpus_per_node", &tr::WorldConfig::getGpusPerNode) + .def_property_readonly("device", &tr::WorldConfig::getDevice) + .def_property_readonly("pipeline_parallel_rank", &tr::WorldConfig::getPipelineParallelRank) + .def_property_readonly("tensor_parallel_rank", &tr::WorldConfig::getTensorParallelRank) + .def_static("mpi", + py::overload_cast, std::optional>( + &tr::WorldConfig::mpi), + py::arg("gpus_per_node") = tr::WorldConfig::kDefaultGpusPerNode, py::arg("tensor_parallelism") = py::none(), + py::arg("pipeline_parallelism") = py::none()); + + py::class_(m, "SamplingConfig") + .def(py::init(), py::arg("beam_width") = 1) + .def_readwrite("beam_width", &tr::SamplingConfig::beamWidth) + .def_readwrite("temperature", &tr::SamplingConfig::temperature) + .def_readwrite("min_length", &tr::SamplingConfig::minLength) + .def_readwrite("repetition_penalty", &tr::SamplingConfig::repetitionPenalty) + .def_readwrite("presence_penalty", &tr::SamplingConfig::presencePenalty) + .def_readwrite("top_k", 
&tr::SamplingConfig::topK) + .def_readwrite("top_p", &tr::SamplingConfig::topP) + .def_readwrite("random_seed", &tr::SamplingConfig::randomSeed) + .def_readwrite("top_p_decay", &tr::SamplingConfig::topPDecay) + .def_readwrite("top_p_min", &tr::SamplingConfig::topPMin) + .def_readwrite("top_p_reset_ids", &tr::SamplingConfig::topPResetIds) + .def_readwrite("beam_search_diversity_rate", &tr::SamplingConfig::beamSearchDiversityRate) + .def_readwrite("length_penalty", &tr::SamplingConfig::lengthPenalty); + + py::class_(m, "GptJsonConfig") + .def(py::init(), py::arg("name"), + py::arg("precision"), py::arg("tensor_parallelism"), py::arg("pipeline_parallelism"), + py::arg("model_config")) + .def_static("parse", py::overload_cast(&tr::GptJsonConfig::parse), py::arg("json")) + .def_static( + "parse_file", [](std::string const& file) { return tr::GptJsonConfig::parse(std::filesystem::path(file)); }, + py::arg("file")) + .def_property_readonly("model_config", &tr::GptJsonConfig::getModelConfig) + .def_property_readonly("name", &tr::GptJsonConfig::getName) + .def_property_readonly("precision", &tr::GptJsonConfig::getPrecision) + .def_property_readonly("tensor_parallelism", &tr::GptJsonConfig::getTensorParallelism) + .def_property_readonly("pipeline_parallelism", &tr::GptJsonConfig::getPipelineParallelism) + .def_property_readonly("world_size", &tr::GptJsonConfig::getWorldSize) + .def("engine_filename", + py::overload_cast( + &tr::GptJsonConfig::engineFilename, py::const_), + py::arg("world_config"), py::arg("model")) + .def("engine_filename", + py::overload_cast(&tr::GptJsonConfig::engineFilename, py::const_), + py::arg("world_config")); + + py::class_(m, "GptSession") + .def(py::init(), py::arg("config"), + py::arg("model_config"), py::arg("world_config"), py::arg("engine_file")) + .def_property_readonly("model_config", &tr::GptSession::getModelConfig) + .def_property_readonly("world_config", &tr::GptSession::getWorldConfig) + .def_property_readonly("device", &tr::GptSession::getDevice) + .def( + "generate", + [](tr::GptSession& self, tpr::GenerationOutput& outputs, tpr::GenerationInput const& inputs, + tr::SamplingConfig const& samplingConfig) + { self.generate(*outputs.toTrtLlm(), *inputs.toTrtLlm(), samplingConfig); }, + py::arg("outputs"), py::arg("inputs"), py::arg("sampling_config")); +} diff --git a/cpp/tensorrt_llm/pybind/runtime/generationInput.cpp b/cpp/tensorrt_llm/pybind/runtime/generationInput.cpp new file mode 100644 index 0000000000..bef4ee167f --- /dev/null +++ b/cpp/tensorrt_llm/pybind/runtime/generationInput.cpp @@ -0,0 +1,54 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "generationInput.h" + +#include "tensorrt_llm/runtime/generationInput.h" +#include "tensorrt_llm/runtime/torchView.h" + +namespace tr = tensorrt_llm::runtime; + +using namespace tensorrt_llm::pybind::runtime; + +std::shared_ptr PromptTuningParams::toTrtLlm() const +{ + auto ptt = std::make_shared(); + if (embeddingTable) + ptt->embeddingTable = tr::TorchView::of(embeddingTable.value()); + if (tasks) + ptt->tasks = tr::TorchView::of(tasks.value()); + if (vocabSize) + ptt->vocabSize = tr::TorchView::of(vocabSize.value()); + ptt->promptTuningEnabled = promptTuningEnabled; + return ptt; +} + +std::shared_ptr GenerationInput::toTrtLlm() const +{ + auto input = std::make_shared( + endId, padId, tr::TorchView::of(ids.value()), tr::TorchView::of(lengths.value()), packed); + if (embeddingBiasOpt) + input->embeddingBiasOpt = tr::TorchView::of(embeddingBiasOpt.value()); + if (badWordsList) + input->badWordsList = tr::TorchView::of(badWordsList.value()); + if (stopWordsList) + input->stopWordsList = tr::TorchView::of(stopWordsList.value()); + input->maxNewTokens = maxNewTokens; + input->promptTuningParams = *promptTuningParams.toTrtLlm(); + return input; + + return input; +} diff --git a/cpp/tensorrt_llm/pybind/runtime/generationInput.h b/cpp/tensorrt_llm/pybind/runtime/generationInput.h new file mode 100644 index 0000000000..d975dba2ff --- /dev/null +++ b/cpp/tensorrt_llm/pybind/runtime/generationInput.h @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/generationInput.h" + +#include + +#include +#include +#include + +namespace tensorrt_llm::pybind::runtime +{ + +using SizeType = tensorrt_llm::runtime::SizeType; + +class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams> +{ +public: + using Base = tensorrt_llm::runtime::GenericPromptTuningParams>; + using TensorPtr = Base::TensorPtr; + using SizeType = Base::SizeType; + + explicit PromptTuningParams( + TensorPtr embeddingTable = TensorPtr(), TensorPtr tasks = TensorPtr(), TensorPtr vocabSize = TensorPtr()) + : GenericPromptTuningParams(std::move(embeddingTable), std::move(tasks), std::move(vocabSize)) + { + } + + [[nodiscard]] std::shared_ptr toTrtLlm() const; +}; + +class GenerationInput + : public tensorrt_llm::runtime::GenericGenerationInput, PromptTuningParams> +{ +public: + using Base = tensorrt_llm::runtime::GenericGenerationInput, PromptTuningParams>; + using TensorPtr = Base::TensorPtr; + + explicit GenerationInput( + SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false) + : GenericGenerationInput(endId, padId, std::move(ids), std::move(lengths), packed) + { + } + + [[nodiscard]] std::shared_ptr toTrtLlm() const; +}; +} // namespace tensorrt_llm::pybind::runtime diff --git a/cpp/tensorrt_llm/pybind/runtime/generationOutput.cpp b/cpp/tensorrt_llm/pybind/runtime/generationOutput.cpp new file mode 100644 index 0000000000..e6d97b9833 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/runtime/generationOutput.cpp @@ -0,0 +1,39 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "generationOutput.h" + +#include "tensorrt_llm/runtime/torchView.h" + +namespace tr = tensorrt_llm::runtime; + +using namespace tensorrt_llm::pybind::runtime; + +std::shared_ptr GenerationOutput::toTrtLlm() const +{ + auto output + = std::make_shared(tr::TorchView::of(ids.value()), tr::TorchView::of(lengths.value())); + if (logProbs) + { + output->logProbs = tr::TorchView::of(logProbs.value()); + } + if (contextLogits) + { + output->contextLogits = tr::TorchView::of(contextLogits.value()); + } + // TODO(mseznec): add support for onTokenGenerated + return output; +} diff --git a/cpp/tensorrt_llm/pybind/runtime/generationOutput.h b/cpp/tensorrt_llm/pybind/runtime/generationOutput.h new file mode 100644 index 0000000000..ce34d7fc1c --- /dev/null +++ b/cpp/tensorrt_llm/pybind/runtime/generationOutput.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/runtime/generationOutput.h" + +#include +#include + +namespace tensorrt_llm::pybind::runtime +{ + +class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput> +{ +public: + using Base = tensorrt_llm::runtime::GenericGenerationOutput>; + using TensorPtr = Base::TensorPtr; + + explicit GenerationOutput(TensorPtr ids, TensorPtr lengths) + : GenericGenerationOutput(std::move(ids), std::move(lengths)) + { + } + + [[nodiscard]] std::shared_ptr toTrtLlm() const; +}; + +} // namespace tensorrt_llm::pybind::runtime diff --git a/cpp/tensorrt_llm/runtime/CMakeLists.txt b/cpp/tensorrt_llm/runtime/CMakeLists.txt index 00ae3c80d2..6f59553341 100644 --- a/cpp/tensorrt_llm/runtime/CMakeLists.txt +++ b/cpp/tensorrt_llm/runtime/CMakeLists.txt @@ -17,6 +17,7 @@ include(FetchContent) set(SRCS utils/numpyUtils.cpp utils/sessionUtils.cpp + utils/debugUtils.cu bufferManager.cpp decodingOutput.cpp gptDecoder.cpp @@ -28,6 +29,7 @@ set(SRCS ipcUtils.cpp memoryCounters.cpp ncclCommunicator.cpp + promptTuningParams.cpp runtimeBuffers.cpp runtimeKernels.cu statefulGptDecoder.cpp diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp index ad4588a4fa..ec2dc1b027 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp @@ -212,9 +212,28 @@ void GptDecoderBatch::newRequest( TensorPtr endIdTensorPtr{ITensor::slice(constPointerCast(dJointInput.endIds), batchIdx, localBatchSize)}; kernels::invokeFill(*endIdTensorPtr, endId, *stream); dInput = std::make_unique(inputLength, localBatchSize, dJointInput.logits, endIdTensorPtr); - dInput->embeddingBias = request.embeddingBias; - dInput->badWordsList = request.badWordsList; - dInput->stopWordsList = request.stopWordsList; + + // Here, we need to add leading 1 dimension since decoderInput expects batchSize as leading dim + // and decoder_batch::Request doesn't have batch dimension + if (request.embeddingBias) + { + TensorPtr biasView = ITensor::view(request.embeddingBias); + biasView->unsqueeze(0); + dInput->embeddingBias = biasView; + } + if (request.badWordsList) + { + TensorPtr badWordsView = ITensor::view(request.badWordsList); + badWordsView->unsqueeze(0); + dInput->badWordsList = badWordsView; + } + if (request.stopWordsList) + { + TensorPtr stopWordsView = ITensor::view(request.stopWordsList); + stopWordsView->unsqueeze(0); + dInput->stopWordsList = stopWordsView; + } + TensorPtr sequenceLimitLength{ ITensor::slice(constPointerCast(dJointInput.sequenceLimitLength), batchIdx, localBatchSize)}; kernels::invokeFill(*sequenceLimitLength, inputLength + maxNewTokens, *stream); @@ -437,10 +456,20 @@ void GptDecoderBatch::newBatch(GenerationInput const& inputs, SamplingConfig con inputView = ITensor::slice(inputs.ids, batchIdx, 1); inputView->reshape(inputShape); } - auto request = decoder_batch::Request{inputView, std::nullopt, inputs.endId, inputs.padId}; - request.embeddingBias = inputs.embeddingBiasOpt; - request.badWordsList = inputs.badWordsList; - request.stopWordsList = inputs.stopWordsList; 
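// The newRequest hunk above wraps each per-request tensor in a view and
// unsqueezes a leading dimension so its shape matches the batched decoder
// inputs. A plain stand-in sketch of that shape change, without the runtime
// tensor API:
#include <cstdint>
#include <vector>

std::vector<std::int64_t> withLeadingBatchDim(std::vector<std::int64_t> shape)
{
    shape.insert(shape.begin(), 1); // [d0, d1, ...] -> [1, d0, d1, ...]
    return shape;
}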
+ auto request = decoder_batch::Request{inputView, inputs.maxNewTokens, inputs.endId, inputs.padId}; + + if (inputs.embeddingBiasOpt) + { + TLLM_THROW("newBatch doesn't support embeddingBias yet."); + } + if (inputs.badWordsList) + { + TLLM_THROW("newBatch doesn't support badWordsList yet."); + } + if (inputs.stopWordsList) + { + TLLM_THROW("newBatch doesn't support stopWordsList yet."); + } newRequest(batchIdx, request, extractSamplingConfig(samplingConfig, batchIdx)); } TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); diff --git a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp index 1f0169f7ce..3860e4b611 100644 --- a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp +++ b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp @@ -38,9 +38,10 @@ FieldType parseJsonFieldOr(Json const& json, std::string_view name, FieldType de { value = json.at(name).template get(); } - catch (nlohmann::json::out_of_range&) + catch (nlohmann::json::out_of_range& e) { - // std::cerr << e.what() << '\n'; + TLLM_LOG_WARNING("Parameter %s cannot be read from json:", std::string(name).c_str()); + TLLM_LOG_WARNING(e.what()); } return value; } @@ -102,6 +103,8 @@ GptJsonConfig parseJson(InputType&& i) auto const maxInputLen = parseJsonFieldOr(builderConfig, "max_input_len", 0); auto const maxOutputLen = parseJsonFieldOr(builderConfig, "max_output_len", 0); auto const maxNumTokens = parseJsonFieldOptional(builderConfig, "max_num_tokens"); + auto const maxPromptEmbeddingTableSize + = parseJsonFieldOr(builderConfig, "max_prompt_embedding_table_size", 0); auto const computeContextLogits = parseJsonFieldOr(builderConfig, "gather_all_token_logits", false); @@ -127,11 +130,12 @@ GptJsonConfig parseJson(InputType&& i) modelConfig.setMaxInputLen(maxInputLen); modelConfig.setMaxOutputLen(maxOutputLen); modelConfig.setMaxNumTokens(maxNumTokens); + modelConfig.setMaxPromptEmbeddingTableSize(maxPromptEmbeddingTableSize); if (name == std::string("chatglm-6b")) { modelConfig.setModelVariant(GptModelConfig::ModelVariant::kGlm); - // kGlm is only for ChatGLM-6B, not for ChatGLM2-6B + // kGlm is only for ChatGLM-6B and Glm-10B } return GptJsonConfig{name, precision, tensorParallelism, pipelineParallelism, modelConfig}; diff --git a/cpp/tensorrt_llm/runtime/gptSession.cpp b/cpp/tensorrt_llm/runtime/gptSession.cpp index d4cb14aaf2..68c70d377a 100644 --- a/cpp/tensorrt_llm/runtime/gptSession.cpp +++ b/cpp/tensorrt_llm/runtime/gptSession.cpp @@ -21,6 +21,7 @@ #include "iBuffer.h" #include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/kernels/decodingKernels.h" #include "tensorrt_llm/runtime/gptDecoderBatch.h" #include "tensorrt_llm/runtime/ipcUtils.h" @@ -48,7 +49,6 @@ GptSession::GptSession(Config const& sessionConfig, GptModelConfig const& modelC , mDevice{utils::initDevice(worldConfig)} , mLogger{logger ? std::move(logger) : std::make_shared()} , mRuntime{std::make_shared(engineBuffer, engineSize, *mLogger)} - , mNumMicroBatches{worldConfig.getPipelineParallelism()} , mDecoders{} , mBuffers{} , mCudaGraphInstances{} @@ -59,6 +59,9 @@ GptSession::GptSession(Config const& sessionConfig, GptModelConfig const& modelC mCommStream = std::make_shared(); } + TLLM_CHECK_WITH_INFO(!(mModelConfig.usePromptTuning() && !mModelConfig.useGptAttentionPlugin()), + "Prompt tuning is only enabled with GPT attention plugin."); + // TODO compare expected and runtime tensor names? 
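    // Illustrative restatement of the check above: it encodes the implication
    // "usePromptTuning() implies useGptAttentionPlugin()". A hypothetical caller that builds its own
    // GptModelConfig could apply the equivalent guard before constructing a session, e.g.:
    //
    //     if (modelConfig.usePromptTuning() && !modelConfig.useGptAttentionPlugin())
    //         TLLM_THROW("Prompt tuning requires the GPT attention plugin to be enabled.");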
setup(sessionConfig); @@ -74,7 +77,7 @@ BufferManager& GptSession::getBufferManager() const return mRuntime->getBufferManager(); } -void GptSession::createContexts(SizeType numMicroBatches, bool useCudaGraphs) +void GptSession::createContexts(SizeType numCtxBatches, SizeType numGenBatches, bool useCudaGraphs) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); mRuntime->clearContexts(); @@ -82,31 +85,22 @@ void GptSession::createContexts(SizeType numMicroBatches, bool useCudaGraphs) if (useCudaGraphs) { // Instantiate multiple graph instances for flip-flopping - mCudaGraphInstances.resize(2 * numMicroBatches); + mCudaGraphInstances.resize(2 * numGenBatches); } auto const numProfiles = mRuntime->getNbProfiles(); TLLM_CHECK_WITH_INFO( numProfiles == 1 || numProfiles == 2, "GPT only expects one optimization profile or two optimization profiles"); - if (numProfiles == 2) - { - auto constexpr ctxContextId = 0; - auto constexpr genContextId = 1; - // Instantiate 2 contexts for flip-flopping - for (auto i = 0; i < 2 * numMicroBatches; ++i) - mRuntime->addContext(genContextId); - // Instantiate 1 context for context phase - for (auto i = 0; i < numMicroBatches; ++i) - mRuntime->addContext(ctxContextId); - } - else - { - auto constexpr contextId = 0; - // Instantiate 2 contexts for flip-flopping - for (auto i = 0; i < 2 * numMicroBatches; ++i) - mRuntime->addContext(contextId); - } + auto constexpr ctxContextId = 0; + auto const genContextId = static_cast(numProfiles == 2); + // Instantiate 2 contexts for flip-flopping + for (auto i = 0; i < 2 * numGenBatches; ++i) + mRuntime->addContext(genContextId); + // Instantiate 1 context for context phase + for (auto i = 0; i < numCtxBatches; ++i) + mRuntime->addContext(ctxContextId); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } @@ -184,11 +178,48 @@ void GptSession::createCustomAllReduceWorkspace( { setPeerAccess(mWorldConfig, true); + mIpcMemoryHandles.clear(); + const std::size_t bufferSize = static_cast(maxBatchSize) * maxBeamWidth * maxSequenceLength + * mModelConfig.getHiddenSize() * mWorldConfig.getTensorParallelism() * sizeof(float); + mIpcMemoryHandles.emplace_back(std::make_shared(mWorldConfig, bufferSize)); + mIpcMemoryHandles.emplace_back(std::make_shared(mWorldConfig, IpcMemory::FLAGS_SIZE * sizeof(int32_t))); + mIpcMemoryHandles.emplace_back(std::make_shared(mWorldConfig, IpcMemory::FLAGS_SIZE * sizeof(int32_t))); + auto& manager = mRuntime->getBufferManager(); - for (const auto& buffer : mBuffers) + mCommPtrs = manager.cpu( + ITensor::makeShape({static_cast(mIpcMemoryHandles.size()) * mWorldConfig.getTensorParallelism()}), + nvinfer1::DataType::kINT64); + const auto commPtrsData = bufferCast(*mCommPtrs); + + for (size_t memIdx = 0; memIdx < mIpcMemoryHandles.size(); memIdx++) { - buffer->createCustomAllReduceWorkspace( - maxBatchSize, maxBeamWidth, maxSequenceLength, mModelConfig.getHiddenSize(), mWorldConfig, manager); + const auto& memCommPtrs = mIpcMemoryHandles[memIdx]->getCommPtrsTensor(); + for (SizeType tpIdx = 0; tpIdx < mWorldConfig.getTensorParallelism(); tpIdx++) + { + commPtrsData[memIdx * mWorldConfig.getTensorParallelism() + tpIdx] = memCommPtrs[tpIdx]; + } + } +} + +GptSession::MicroBatchConfig::MicroBatchConfig(SizeType maxBatchSize, SizeType pipelineParallelism, + std::optional genMicroBatchSize, std::optional ctxMicroBatchSize) +{ + if (genMicroBatchSize || ctxMicroBatchSize) + { + genBatchSize = genMicroBatchSize.value_or(maxBatchSize); + TLLM_CHECK(genBatchSize <= maxBatchSize); + ctxBatchSize = 
ctxMicroBatchSize.value_or(genBatchSize); + TLLM_CHECK_WITH_INFO(genBatchSize % ctxBatchSize == 0, + tc::fmtstr( + "Generation batch size (%d) must be divisible by context batch size (%d)", genBatchSize, ctxBatchSize) + .c_str()); + numGenBatches = tc::ceilDiv(maxBatchSize, genBatchSize); + numCtxBatches = numGenBatches * (genBatchSize / ctxBatchSize); + } + else + { + numCtxBatches = numGenBatches = pipelineParallelism; + ctxBatchSize = genBatchSize = tc::ceilDiv(maxBatchSize, numGenBatches); } } @@ -202,12 +233,12 @@ void GptSession::setup(Config const& sessionConfig) auto const maxBeamWidth = sessionConfig.maxBeamWidth; auto const maxSequenceLength = sessionConfig.maxSequenceLength; - if (sessionConfig.numMicroBatches) - mNumMicroBatches = sessionConfig.numMicroBatches.value(); - createContexts(mNumMicroBatches, sessionConfig.cudaGraphMode); - createBuffers(mNumMicroBatches); + mMicroBatchConfig = MicroBatchConfig(maxBatchSize, mWorldConfig.getPipelineParallelism(), + sessionConfig.genMicroBatchSize, sessionConfig.ctxMicroBatchSize); + + createContexts(mMicroBatchConfig.numCtxBatches, mMicroBatchConfig.numGenBatches, sessionConfig.cudaGraphMode); + createBuffers(mMicroBatchConfig.numGenBatches); - auto const microBatchSize = tc::ceilDiv(maxBatchSize, mNumMicroBatches); // Store this param related to decoder buffer size and kv cache manager to check against // the input shape with the params given in generate(). // gptDecoderBatch does not resize buffers, but allows smaller batchSize and beamWidth. @@ -222,28 +253,29 @@ void GptSession::setup(Config const& sessionConfig) if (mWorldConfig.isLastPipelineParallelRank()) { auto const logitsType = mRuntime->getEngine().getTensorDataType("logits"); - createDecoders(microBatchSize, maxBeamWidth, maxSequenceLength, logitsType, sessionConfig.decoderPerRequest, - mNumMicroBatches); + createDecoders(mMicroBatchConfig.genBatchSize, maxBeamWidth, maxSequenceLength, logitsType, + sessionConfig.decoderPerRequest, mMicroBatchConfig.numGenBatches); } - if (mWorldConfig.isPipelineParallel() || mNumMicroBatches > 1) + if (mWorldConfig.isPipelineParallel() || mMicroBatchConfig.numGenBatches > 1) { mReceivedEvents.clear(); - for (SizeType i = 0; i < mNumMicroBatches; ++i) + for (SizeType i = 0; i < mMicroBatchConfig.numGenBatches; ++i) mReceivedEvents.emplace_back(); } if (mWorldConfig.isTensorParallel() && mModelConfig.useCustomAllReduce()) { - createCustomAllReduceWorkspace(microBatchSize, maxBeamWidth, maxSequenceLength); + createCustomAllReduceWorkspace(mMicroBatchConfig.genBatchSize, maxBeamWidth, maxSequenceLength); } - // we don't know maxInputLength and maxNewTokens yet and ignore those for pre-allocation - auto const generationConfig - = RuntimeBuffers::GenerationConfig{microBatchSize, maxBeamWidth, 0, 0, maxSequenceLength}; - for (auto& buffers : mBuffers) - buffers->reshape(generationConfig, mModelConfig, mWorldConfig); + { + // we don't know maxInputLength yet and ignore it for pre-allocation + buffers->generationConfig + = RuntimeBuffers::GenerationConfig{mMicroBatchConfig.genBatchSize, maxBeamWidth, 0, maxSequenceLength}; + buffers->reshape(mModelConfig, mWorldConfig); + } TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } @@ -263,8 +295,8 @@ void GptSession::kvCacheAddSequences(SizeType beamWidth, SizeType microBatchId, } } -ITensor::SharedPtr GptSession::initNewTokens( - GenerationInput const& inputs, SamplingConfig const& samplingConfig, SizeType microBatchId) +ITensor::SharedPtr GptSession::initDecoder(ITensor& outputIds, GenerationInput 
const& inputs, + SamplingConfig const& samplingConfig, SizeType microBatchId) const { if (mWorldConfig.isLastPipelineParallelRank()) { @@ -274,9 +306,29 @@ ITensor::SharedPtr GptSession::initNewTokens( } else if (mWorldConfig.isFirstPipelineParallelRank()) { + auto& manager = mRuntime->getBufferManager(); + auto const& stream = mRuntime->getStreamPtr(); + + auto const inputLengths = inputs.lengths; + auto const batchSize = static_cast(inputLengths->getSize()); + + auto const inputLengthsHost = manager.copyFrom(*inputLengths, MemoryType::kCPU); + auto const* inputLengthsData = bufferCast(*inputLengthsHost); + SizeType const maxInputLength = *std::max_element(inputLengthsData, inputLengthsData + inputLengths->getSize()); + + ITensor::SharedPtr inputOffsets = manager.emptyTensor(MemoryType::kGPU, TRTDataType::value); + if (inputs.packed) + { + inputOffsets->reshape(ITensor::makeShape({batchSize + 1})); + manager.setZero(*inputOffsets); + kernels::invokeInclusiveSum(*ITensor::slice(inputOffsets, 1), *inputLengths, manager, *stream); + } + + kernels::initOutputIds(outputIds, *inputs.ids, *inputLengths, *inputOffsets, inputs.padId, inputs.endId, + maxInputLength, inputs.packed, *stream); + auto const beamWidth = samplingConfig.beamWidth; - auto const batchSize = static_cast(inputs.lengths->getSize()); - return mRuntime->getBufferManager().gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kINT32); + return manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kINT32); } else { @@ -286,32 +338,34 @@ ITensor::SharedPtr GptSession::initNewTokens( namespace { -std::vector splitInputs( - GenerationInput const& inputs, SizeType numMicroBatches, BufferManager& manager) +std::tuple, std::vector, std::vector> splitInputIds( + GenerationInput const& inputs, SizeType microBatchSize, BufferManager& manager) { - std::vector inputBatches; auto const numRequests = inputs.lengths->getShape().d[0]; - auto const microBatchSize = tc::ceilDiv(numRequests, numMicroBatches); + std::vector inputIds; + std::vector inputLengths; + std::vector microBatchOffsets(1, 0); if (inputs.packed) { - auto contextLengthsHost = manager.copyFrom(*inputs.lengths, MemoryType::kCPU); + auto const contextLengthsHost = manager.copyFrom(*inputs.lengths, MemoryType::kCPU); ITensor::SharedPtr inputIdsView = ITensor::view(inputs.ids); inputIdsView->squeeze(0); - auto contextLengthsRange = BufferRange(*contextLengthsHost); + auto const contextLengthsRange = BufferRange(*contextLengthsHost); auto tokensBegin = 0; for (auto offset = 0; offset < numRequests; offset += microBatchSize) { - auto batchSize = std::min(microBatchSize, numRequests - offset); - auto numTokens = std::accumulate( + auto const batchSize = std::min(microBatchSize, numRequests - offset); + auto const numTokens = std::accumulate( contextLengthsRange.begin() + offset, contextLengthsRange.begin() + offset + batchSize, 0); ITensor::SharedPtr batchInputs = ITensor::slice(inputIdsView, tokensBegin, numTokens); batchInputs->reshape(ITensor::makeShape({1, numTokens})); - inputBatches.emplace_back(inputs.endId, inputs.padId, batchInputs, - ITensor::slice(inputs.lengths, offset, batchSize), inputs.packed); + inputIds.emplace_back(std::move(batchInputs)); + inputLengths.emplace_back(ITensor::slice(inputs.lengths, offset, batchSize)); + microBatchOffsets.emplace_back(offset + batchSize); tokensBegin += numTokens; } @@ -320,24 +374,66 @@ std::vector splitInputs( { for (auto offset = 0; offset < numRequests; offset += microBatchSize) { - auto 
batchSize = std::min(microBatchSize, numRequests - offset);
-            inputBatches.emplace_back(inputs.endId, inputs.padId, ITensor::slice(inputs.ids, offset, batchSize),
-                ITensor::slice(inputs.lengths, offset, batchSize), inputs.packed);
+            auto const batchSize = std::min(microBatchSize, numRequests - offset);
+
+            inputIds.emplace_back(ITensor::slice(inputs.ids, offset, batchSize));
+            inputLengths.emplace_back(ITensor::slice(inputs.lengths, offset, batchSize));
+            microBatchOffsets.emplace_back(offset + batchSize);
         }
     }
 
-    for (auto& batch : inputBatches)
+    return {inputIds, inputLengths, microBatchOffsets};
+}
+
+std::vector splitInputs(GenerationInput const& inputs, SizeType microBatchSize, BufferManager& manager)
+{
+    TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+    auto [inputIds, inputLengths, microBatchOffsets] = splitInputIds(inputs, microBatchSize, manager);
+
+    std::vector inputBatches;
+    for (std::size_t batchId = 0; batchId < inputIds.size(); ++batchId)
     {
+        inputBatches.emplace_back(
+            inputs.endId, inputs.padId, std::move(inputIds[batchId]), std::move(inputLengths[batchId]), inputs.packed);
+    }
+
+    for (std::size_t batchId = 0; batchId < inputBatches.size(); ++batchId)
+    {
+        auto& batch = inputBatches[batchId];
+        auto const offset = microBatchOffsets[batchId];
+        auto const batchSize = microBatchOffsets[batchId + 1] - offset;
+
         if (inputs.embeddingBiasOpt)
             batch.embeddingBiasOpt = inputs.embeddingBiasOpt;
         if (inputs.badWordsList)
-            batch.badWordsList = inputs.badWordsList;
+        {
+            auto const& shape = inputs.badWordsList->getShape();
+            if (shape.nbDims == 2)
+            {
+                batch.badWordsList = inputs.badWordsList;
+            }
+            else
+            {
+                assert(shape.nbDims == 3);
+                batch.badWordsList = ITensor::slice(inputs.badWordsList, offset, batchSize);
+            }
+        }
         if (inputs.stopWordsList)
-            batch.stopWordsList = inputs.stopWordsList;
+        {
+            batch.stopWordsList = ITensor::slice(inputs.stopWordsList, offset, batchSize);
+        }
         if (inputs.maxNewTokens)
             batch.maxNewTokens = inputs.maxNewTokens;
+
+        if (inputs.promptTuningParams.embeddingTable)
+            batch.promptTuningParams.embeddingTable = inputs.promptTuningParams.embeddingTable;
+        if (inputs.promptTuningParams.tasks)
+            batch.promptTuningParams.tasks = ITensor::slice(inputs.promptTuningParams.tasks, offset, batchSize);
+        if (inputs.promptTuningParams.vocabSize)
+            batch.promptTuningParams.vocabSize = inputs.promptTuningParams.vocabSize;
     }
+    TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
     return inputBatches;
 }
 
@@ -381,40 +477,33 @@ void GptSession::generate(
         outputs.contextLogits->reshape(ITensor::makeShape({batchSize, maxInputLength, vocabSizePadded}));
     }
 
-    auto const numMicroBatches = std::min(batchSize, mNumMicroBatches);
-    if (numMicroBatches == 1)
+    if (batchSize <= mMicroBatchConfig.genBatchSize)
     {
         std::vector microBatches{inputs};
         generateBatched(outputs, microBatches, samplingConfig);
     }
     else
     {
-        auto const microBatches = splitInputs(inputs, numMicroBatches, manager);
+        auto const microBatches = splitInputs(inputs, mMicroBatchConfig.genBatchSize, manager);
         generateBatched(outputs, microBatches, samplingConfig);
     }
     TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
 
-std::function GptSession::createOnTokenGeneratedCallback(
-    GenerationOutput& outputs, SizeType numMicroBatches)
+std::function GptSession::createOnTokenGeneratedCallback(GenerationOutput& outputs)
 {
     if (outputs.onTokenGenerated && mWorldConfig.isFirstPipelineParallelRank())
    {
-        ITensor::SharedPtr outputIds{mWorldConfig.isPipelineParallel() || mNumMicroBatches > 1
+        ITensor::SharedPtr
outputIds{mWorldConfig.isPipelineParallel() || mMicroBatchConfig.numGenBatches > 1 ? outputs.ids : mDecoders.front()->getOutputIds()}; - auto const lastMicroBatchId = numMicroBatches - 1; - return [onTokenGenerated = outputs.onTokenGenerated, outputIds = std::move(outputIds), lastMicroBatchId]( - SizeType microBatchId, SizeType step, bool finished) - { - if (microBatchId == lastMicroBatchId) - onTokenGenerated(outputIds, step, finished); - }; + return [onTokenGenerated = outputs.onTokenGenerated, outputIds = std::move(outputIds)]( + SizeType step, bool finished) { onTokenGenerated(outputIds, step, finished); }; } else { - return [](SizeType microBatchId, SizeType step, bool finished) {}; + return [](SizeType step, bool finished) {}; } } @@ -426,52 +515,50 @@ void GptSession::generateBatched( auto& manager = mRuntime->getBufferManager(); auto const numMicroBatches = static_cast(microBatches.size()); TLLM_CHECK(numMicroBatches > 0); - TLLM_CHECK(numMicroBatches <= mNumMicroBatches); + TLLM_CHECK(numMicroBatches <= mMicroBatchConfig.numGenBatches); SizeType const beamWidth{samplingConfig.beamWidth}; // Initialize and reshape buffers - std::vector generationConfigs; for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId) { auto const& microBatchInputs = microBatches.at(microBatchId); auto& buffers = *mBuffers.at(microBatchId); - TLLM_CHECK_WITH_INFO(buffers.allocated, "Buffers not allocated, please call setup first!"); - buffers.initContextLengths(microBatchInputs.lengths, manager); - generationConfigs.emplace_back( - RuntimeBuffers::GenerationConfig::fromInput(*microBatchInputs.ids, *buffers.contextLengthsHost, - microBatchInputs.packed, beamWidth, mDecoderMaxSequenceLength, microBatchInputs.maxNewTokens)); - buffers.reshape(generationConfigs.back(), mModelConfig, mWorldConfig); + buffers.initFromInput(*microBatchInputs.ids, microBatchInputs.lengths, microBatchInputs.packed, beamWidth, + mDecoderMaxSequenceLength, manager); + buffers.reshape(mModelConfig, mWorldConfig); + buffers.reset(manager); } - auto minMaxNewTokens = std::numeric_limits::max(); std::vector microBatchOffsets(1, 0); microBatchOffsets.reserve(numMicroBatches + 1); for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId) { - auto const& generationConfig = generationConfigs.at(microBatchId); - minMaxNewTokens = std::min(minMaxNewTokens, generationConfig.maxNewTokens); + auto const& generationConfig = mBuffers.at(microBatchId)->generationConfig; microBatchOffsets.emplace_back(microBatchOffsets.back() + generationConfig.batchSize); } for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId) { auto& buffers = *mBuffers.at(microBatchId); - auto const& generationConfig = generationConfigs.at(microBatchId); auto const batchOffset = microBatchOffsets.at(microBatchId); kvCacheAddSequences(beamWidth, microBatchId, batchOffset); auto const& microBatchInputs = microBatches.at(microBatchId); - buffers.newTokens = initNewTokens(microBatchInputs, samplingConfig, microBatchId); - auto const microBatchSize = generationConfig.batchSize; + auto const microBatchSize = buffers.generationConfig.batchSize; buffers.outputIds = ITensor::slice(outputs.ids, batchOffset, microBatchSize); buffers.outputLengths = ITensor::slice(outputs.lengths, batchOffset, microBatchSize); + buffers.newTokens = initDecoder(*buffers.outputIds, microBatchInputs, samplingConfig, microBatchId); if (mWorldConfig.isLastPipelineParallelRank() && mModelConfig.computeContextLogits()) { buffers.logits = 
ITensor::slice(outputs.contextLogits, batchOffset, microBatchSize); } + if (mModelConfig.usePromptTuning()) + { + buffers.promptTuningParams = microBatchInputs.promptTuningParams; + } } // Prepare the onTokenGenerated callback - auto const onTokenGenerated = createOnTokenGeneratedCallback(outputs, numMicroBatches); + auto const onTokenGenerated = createOnTokenGeneratedCallback(outputs); if (useCudaGraphs()) { @@ -483,101 +570,25 @@ void GptSession::generateBatched( auto kvCacheManager = mModelConfig.usePagedKvCache() ? mKvCacheManager.get() : nullptr; - std::vector inputBuffers(numMicroBatches * 2); - std::vector outputBuffers(numMicroBatches * 2); + executeContextStep(microBatches, microBatchOffsets, kvCacheManager); + std::vector microBatchesFinished(numMicroBatches, false); - auto notFinished = [µBatchesFinished]() - { return std::any_of(microBatchesFinished.begin(), microBatchesFinished.end(), [](bool x) { return !x; }); }; - - for (SizeType step = 0; step < minMaxNewTokens && notFinished(); ++step) + SizeType numBatchesFinished{0}; + SizeType step{0}; + while (numBatchesFinished < numMicroBatches) { - auto const flipFlopId = step % 2; - for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId) - { - if (microBatchesFinished.at(microBatchId)) - continue; + ++step; + numBatchesFinished + += executeGenerationStep(step, microBatches, microBatchOffsets, kvCacheManager, microBatchesFinished); - auto& buffers = *mBuffers.at(microBatchId); - auto& generationConfig = generationConfigs.at(microBatchId); - - auto const contextId = flipFlopId * numMicroBatches + microBatchId; - auto& inputBuffer = inputBuffers[contextId]; - auto& outputBuffer = outputBuffers[contextId]; - - if (step == 0) - { - SizeType const contextIdForContextPhase - = (mRuntime->getNbProfiles() == 2 ? 
2 * mNumMicroBatches : 0) + microBatchId; - - auto const& microBatchInputs = microBatches.at(microBatchId); - buffers.prepareContextStep(microBatchInputs.ids, microBatchInputs.padId, manager, kvCacheManager, - microBatchOffsets.at(microBatchId), generationConfig, mModelConfig, mWorldConfig); - buffers.getRuntimeBuffers( - inputBuffer, outputBuffer, step, microBatchInputs.ids, mModelConfig, mWorldConfig); - mRuntime->setInputTensors(contextIdForContextPhase, inputBuffer); - mRuntime->setOutputTensors(contextIdForContextPhase, outputBuffer); - - TLLM_CHECK_WITH_INFO( - mRuntime->executeContext(contextIdForContextPhase), "Executing TRT engine in context step failed!"); - sync_check_cuda_error(); - - buffers.postContextStep(manager, generationConfig, mModelConfig, mWorldConfig); - sync_check_cuda_error(); - } - else - { - auto nextInputIds = buffers.prepareNextStep(step - 1, manager, kvCacheManager, - microBatchOffsets.at(microBatchId), generationConfig, mModelConfig, mWorldConfig); - buffers.getRuntimeBuffers(inputBuffer, outputBuffer, step, nextInputIds, mModelConfig, mWorldConfig); - mRuntime->setInputTensors(contextId, inputBuffer); - mRuntime->setOutputTensors(contextId, outputBuffer); - - if (useCudaGraphs()) - { - mCudaGraphInstances.at(contextId).prepareNextGraph(*mRuntime, contextId); - } - - // check decoder result of previous iteration - auto const microBatchSize = generationConfig.batchSize; - auto const shouldStop = shouldStopSync(microBatchSize, beamWidth, microBatchId); - onTokenGenerated(microBatchId, step - 1, shouldStop); - - if (shouldStop) - { - mLogger->log(nvinfer1::ILogger::Severity::kVERBOSE, "GPT decoding finished early"); - microBatchesFinished.at(microBatchId) = true; - continue; - } - - if (useCudaGraphs()) - { - auto& cudaGraphInstance = mCudaGraphInstances.at(contextId); - TLLM_CHECK(cudaGraphInstance.hasInstance()); - cudaGraphInstance.launch(mRuntime->getStream()); - } - else - { - TLLM_CHECK_WITH_INFO(mRuntime->executeContext(contextId), - tc::fmtstr("Executing TRT engine in step %d failed!", step)); - } - sync_check_cuda_error(); - } - - std::swap(buffers.cacheIndirectionDecoderInput, buffers.cacheIndirectionDecoderOutput); - - auto const maxInputLength = generationConfigs.at(microBatchId).maxInputLength; - auto const decoderStep = maxInputLength + step; - decoderStepAsync(decoderStep, microBatchId); - } + onTokenGenerated(step - 1, numBatchesFinished == numMicroBatches); } // Collect the results for the last step for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId) { - auto const& generationConfig = generationConfigs.at(microBatchId); + auto const& generationConfig = mBuffers.at(microBatchId)->generationConfig; auto const microBatchSize = generationConfig.batchSize; - auto const shouldStop = shouldStopSync(microBatchSize, beamWidth, microBatchId); - onTokenGenerated(microBatchId, minMaxNewTokens - 1, shouldStop); auto const firstBatchIdx = microBatchOffsets.at(microBatchId); if (mModelConfig.usePagedKvCache()) @@ -594,10 +605,129 @@ void GptSession::generateBatched( else if (!mWorldConfig.isPipelineParallel()) manager.copy(*mDecoders.at(microBatchId)->getOutputIds(), *mBuffers.at(microBatchId)->outputIds); } + manager.getStream().synchronize(); TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } +void GptSession::executeContextStep(std::vector const& generationBatches, + std::vector const& generationBatchOffsets, KvCacheManager const* kvCacheManager) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto& manager = 
mRuntime->getBufferManager(); + + auto const numGenerationBatches = static_cast(generationBatches.size()); + auto constexpr step = 0; + for (auto generationBatchId = 0; generationBatchId < numGenerationBatches; ++generationBatchId) + { + auto const& generationBatchInputs = generationBatches.at(generationBatchId); + auto& generationBuffers = *mBuffers.at(generationBatchId); + + auto const contextBatchSize = mMicroBatchConfig.ctxBatchSize; + auto [inputIds, inputLengths, contextBatchOffsets] + = splitInputIds(generationBatchInputs, contextBatchSize, manager); + auto contextBuffers = generationBuffers.split(contextBatchSize, mModelConfig, mWorldConfig); + TLLM_CHECK(inputIds.size() == contextBuffers.size()); + auto const numContextBatches = static_cast(contextBuffers.size()); + + for (auto contextBatchId = 0; contextBatchId < numContextBatches; ++contextBatchId) + { + auto batchOffset = generationBatchOffsets.at(generationBatchId) + contextBatchOffsets.at(contextBatchId); + auto& buffers = contextBuffers.at(contextBatchId); + auto& inputBuffer = buffers.inputBuffers[0]; + auto& outputBuffer = buffers.outputBuffers[0]; + + auto const contextId = mMicroBatchConfig.getCtxContextId(generationBatchId, contextBatchId); + + buffers.prepareContextStep(inputIds.at(contextBatchId), generationBatchInputs.padId, manager, + kvCacheManager, batchOffset, mModelConfig, mWorldConfig); + buffers.getRuntimeBuffers( + inputBuffer, outputBuffer, step, inputIds.at(contextBatchId), mCommPtrs, mModelConfig, mWorldConfig); + mRuntime->setInputTensors(contextId, inputBuffer); + mRuntime->setOutputTensors(contextId, outputBuffer); + + TLLM_CHECK_WITH_INFO(mRuntime->executeContext(contextId), "Executing TRT engine in context step failed!"); + sync_check_cuda_error(); + } + + generationBuffers.postContextStep(contextBuffers, manager, mModelConfig, mWorldConfig); + sync_check_cuda_error(); + + std::swap(generationBuffers.cacheIndirectionDecoderInput, generationBuffers.cacheIndirectionDecoderOutput); + + auto const decoderStep = generationBuffers.generationConfig.maxInputLength + step; + decoderStepAsync(decoderStep, generationBatchId); + } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +SizeType GptSession::executeGenerationStep(SizeType step, std::vector const& microBatches, + std::vector const& microBatchOffsets, KvCacheManager* kvCacheManager, + std::vector& microBatchesFinished) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto& manager = mRuntime->getBufferManager(); + + auto const numMicroBatches = static_cast(microBatches.size()); + SizeType numBatchesFinished{0}; + + auto const flipFlopId = step % 2; + for (auto generationBatchId = 0; generationBatchId < numMicroBatches; ++generationBatchId) + { + if (microBatchesFinished.at(generationBatchId)) + continue; + + auto& buffers = *mBuffers.at(generationBatchId); + auto const& generationConfig = buffers.generationConfig; + + auto const contextId = mMicroBatchConfig.getGenContextId(flipFlopId, generationBatchId); + auto& inputBuffer = buffers.inputBuffers[flipFlopId]; + auto& outputBuffer = buffers.outputBuffers[flipFlopId]; + + auto nextInputIds = buffers.prepareNextStep( + step - 1, manager, kvCacheManager, microBatchOffsets.at(generationBatchId), mModelConfig, mWorldConfig); + buffers.getRuntimeBuffers(inputBuffer, outputBuffer, step, nextInputIds, mCommPtrs, mModelConfig, mWorldConfig); + mRuntime->setInputTensors(contextId, inputBuffer); + mRuntime->setOutputTensors(contextId, outputBuffer); + + if (useCudaGraphs()) + { + 
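            // Background for this block: createContexts() sizes mCudaGraphInstances to
            // 2 * numGenBatches, so every generation micro-batch owns a pair of graph instances and
            // flipFlopId = step % 2 selects one of the pair each step. Assuming getGenContextId()
            // keeps the previous flipFlopId * numGenBatches + generationBatchId layout, two
            // micro-batches would alternate between instances {0, 2} and {1, 3} on even/odd steps.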
mCudaGraphInstances.at(contextId).prepareNextGraph(*mRuntime, contextId); + } + + // check decoder result of previous iteration + if (shouldStopSync(generationConfig.batchSize, generationConfig.beamWidth, generationBatchId)) + { + mLogger->log(nvinfer1::ILogger::Severity::kVERBOSE, + tc::fmtstr("GPT decoding finished for step %d and microBatchId %d", step, generationBatchId).c_str()); + microBatchesFinished.at(generationBatchId) = true; + numBatchesFinished += 1; + continue; + } + + if (useCudaGraphs()) + { + auto& cudaGraphInstance = mCudaGraphInstances.at(contextId); + TLLM_CHECK(cudaGraphInstance.hasInstance()); + cudaGraphInstance.launch(mRuntime->getStream()); + } + else + { + TLLM_CHECK_WITH_INFO( + mRuntime->executeContext(contextId), tc::fmtstr("Executing TRT engine in step %d failed!", step)); + } + sync_check_cuda_error(); + + std::swap(buffers.cacheIndirectionDecoderInput, buffers.cacheIndirectionDecoderOutput); + + auto const decoderStep = generationConfig.maxInputLength + step; + decoderStepAsync(decoderStep, generationBatchId); + } + + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + return numBatchesFinished; +} + void GptSession::decoderStepAsync(SizeType decoderStep, SizeType microBatchId) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); @@ -662,7 +792,7 @@ void GptSession::decoderStepAsync(SizeType decoderStep, SizeType microBatchId) mCommStream->record(mReceivedEvents.at(microBatchId).get()); } - if (!mWorldConfig.isPipelineParallel() && mNumMicroBatches > 1) + if (!mWorldConfig.isPipelineParallel() && mMicroBatchConfig.numGenBatches > 1) { updateOutputIds(outputIds, newTokens, decoderStep, stream); stream.record(mReceivedEvents.at(microBatchId).get()); @@ -684,7 +814,7 @@ bool GptSession::shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType decoder.forwardSync(); nbFinished = *bufferCast(*decoder.getNbFinished()); - if (!mWorldConfig.isPipelineParallel() && mNumMicroBatches > 1) + if (!mWorldConfig.isPipelineParallel() && mMicroBatchConfig.numGenBatches > 1) { // ensure outputIds have been updated mReceivedEvents.at(microBatchId).synchronize(); diff --git a/cpp/tensorrt_llm/runtime/memoryCounters.cpp b/cpp/tensorrt_llm/runtime/memoryCounters.cpp index 7f741ab60a..48e6d3a277 100644 --- a/cpp/tensorrt_llm/runtime/memoryCounters.cpp +++ b/cpp/tensorrt_llm/runtime/memoryCounters.cpp @@ -57,6 +57,12 @@ std::string MemoryCounters::bytesToString(DiffType bytes, int precision) return doubleBytesToString(static_cast(bytes), precision); } +std::string MemoryCounters::toString() const +{ + return tensorrt_llm::common::fmtstr("[MemUsage] GPU %s, CPU %s, Pinned %s", bytesToString(this->getGpu()).c_str(), + bytesToString(this->getCpu()).c_str(), bytesToString(this->getPinned()).c_str()); +} + void MemoryCounters::allocate(MemoryType memoryType, MemoryCounters::SizeType size) { switch (memoryType) diff --git a/cpp/tensorrt_llm/runtime/promptTuningParams.cpp b/cpp/tensorrt_llm/runtime/promptTuningParams.cpp new file mode 100644 index 0000000000..60074ad025 --- /dev/null +++ b/cpp/tensorrt_llm/runtime/promptTuningParams.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/runtime/promptTuningParams.h" + +namespace tensorrt_llm::runtime +{ + +void PromptTuningParams::fillTasksTensor(TensorPtr tasksHost, const SizeType batchSize, + const SizeType numContextRequests, const std::vector& reqBeamWidths, + const std::vector& reqPromptLengths, BufferManager& manager, bool packedInput) +{ + auto const& tasksHostShape = tasksHost->getShape(); + TLLM_CHECK_WITH_INFO(tasksHostShape.nbDims == 1, "tasksHost expected to have dimension [batchSize]"); + TLLM_CHECK_WITH_INFO(tasksHostShape.d[0] == batchSize, "tasksHost expected to have dimension [batchSize]"); + + auto const tasksHostPtr = bufferCast(*tasksHost); + + bool validInput = packedInput || numContextRequests == batchSize || numContextRequests == 0; + TLLM_CHECK_WITH_INFO(validInput, + "fillTasksTensor function with packed inputs must be called with only context requests or only generation " + "requests."); + + bool validShapes = (static_cast(reqBeamWidths.size()) == batchSize + && static_cast(reqPromptLengths.size()) == numContextRequests + && static_cast(promptTuningEnabled.size()) == batchSize); + TLLM_CHECK_WITH_INFO(validShapes, + "Invalid inputs to fillTasksTensor function. reqBeamWidths and reqPtuningEnabled size must be batchSize and " + "propmtLenghts size must be numContextRequests"); + + SizeType totalInputSize = 0; + std::vector promptTasksHost; + for (SizeType bid = 0; bid < batchSize; bid++) + { + SizeType taskId = promptTuningEnabled[bid] ? 
tasksHostPtr[bid] : 0; + if (packedInput) + { + if (bid < numContextRequests) + { + totalInputSize += reqPromptLengths[bid]; + promptTasksHost.insert(promptTasksHost.end(), reqPromptLengths[bid], taskId); + } + else + { + for (SizeType beam = 0; beam < reqBeamWidths[bid]; ++beam) + { + promptTasksHost.insert(promptTasksHost.end(), 1, taskId); + totalInputSize++; + } + } + } + else + { + if (bid < numContextRequests) + { + promptTasksHost.push_back(taskId); + ++totalInputSize; + } + else + { + promptTasksHost.insert(promptTasksHost.end(), reqBeamWidths[bid], taskId); + totalInputSize += reqBeamWidths[bid]; + } + } + } + + if (packedInput) + { + tasks = manager.copyFrom( + promptTasksHost, runtime::ITensor::makeShape({1, totalInputSize}), runtime::MemoryType::kGPU); + } + else + { + tasks = manager.copyFrom( + promptTasksHost, runtime::ITensor::makeShape({totalInputSize, 1}), runtime::MemoryType::kGPU); + } +} + +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp b/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp index 31b64a20df..6d7ab16d72 100644 --- a/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp +++ b/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp @@ -16,7 +16,6 @@ #include "tensorrt_llm/runtime/runtimeBuffers.h" -#include "ipcUtils.h" #include "tensorrt_llm/batch_manager/kvCacheManager.h" #include "tensorrt_llm/common/stlUtils.h" #include "tensorrt_llm/runtime/runtimeKernels.h" @@ -30,8 +29,7 @@ using namespace tensorrt_llm::runtime; namespace tc = tensorrt_llm::common; RuntimeBuffers::GenerationConfig RuntimeBuffers::GenerationConfig::fromInput(ITensor const& inputIds, - ITensor const& inputLengthsHost, bool const inputPacked, SizeType const beamWidth, SizeType const maxSequenceLength, - std::optional const& maxNewTokensOpt) + ITensor const& inputLengthsHost, bool const inputPacked, SizeType const beamWidth, SizeType const maxSequenceLength) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const batchSize = static_cast(inputLengthsHost.getSize()); @@ -54,13 +52,12 @@ RuntimeBuffers::GenerationConfig RuntimeBuffers::GenerationConfig::fromInput(ITe maxInputLength = inputShape.d[1]; } - auto const maxNewTokens = maxNewTokensOpt.value_or(maxSequenceLength - maxInputLength); - TLLM_CHECK_WITH_INFO(1 <= maxNewTokens && maxNewTokens <= maxSequenceLength - maxInputLength, + TLLM_CHECK_WITH_INFO(maxInputLength < maxSequenceLength, "Max input length is equal to or larger that maxSequenceLength given in setup. 
No new tokens can be " "generated."); TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); - return GenerationConfig{batchSize, beamWidth, maxInputLength, maxNewTokens, maxSequenceLength, inputLengthSum}; + return GenerationConfig{batchSize, beamWidth, maxInputLength, maxSequenceLength, inputLengthSum}; } void RuntimeBuffers::clear() @@ -91,6 +88,16 @@ void RuntimeBuffers::clear() TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } +void RuntimeBuffers::clearTensorMaps() +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + for (auto& buffer : inputBuffers) + buffer.clear(); + for (auto& buffer : outputBuffers) + buffer.clear(); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + void RuntimeBuffers::create(TllmRuntime& runtime, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); @@ -171,41 +178,19 @@ void RuntimeBuffers::create(TllmRuntime& runtime, GptModelConfig const& modelCon TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::initContextLengths(TensorPtr const& inputLengths, BufferManager& manager) +void RuntimeBuffers::initFromInput(ITensor const& inputIds, TensorPtr const& inputLengths, bool inputPacked, + SizeType beamWidth, SizeType maxSequenceLength, BufferManager& manager) { contextLengthsDevice = inputLengths; contextLengthsHost->reshape(inputLengths->getShape()); manager.copy(*contextLengthsDevice, *contextLengthsHost); manager.getStream().synchronize(); // wait for context lengths to be copied to host + + generationConfig = RuntimeBuffers::GenerationConfig::fromInput( + inputIds, *contextLengthsHost, inputPacked, beamWidth, maxSequenceLength); } -void RuntimeBuffers::createCustomAllReduceWorkspace(SizeType maxBatchSize, SizeType maxBeamWidth, - SizeType maxSequenceLength, SizeType hiddenSize, WorldConfig const& worldConfig, BufferManager& manager) -{ - mIpcMemoryHandles.clear(); - const std::size_t bufferSize = static_cast(maxBatchSize) * maxBeamWidth * maxSequenceLength - * hiddenSize * worldConfig.getTensorParallelism() * sizeof(float); - mIpcMemoryHandles.emplace_back(std::make_shared(worldConfig, bufferSize)); - mIpcMemoryHandles.emplace_back(std::make_shared(worldConfig, IpcMemory::FLAGS_SIZE * sizeof(int32_t))); - mIpcMemoryHandles.emplace_back(std::make_shared(worldConfig, IpcMemory::FLAGS_SIZE * sizeof(int32_t))); - - commPtrs = manager.cpu( - ITensor::makeShape({static_cast(mIpcMemoryHandles.size()) * worldConfig.getTensorParallelism()}), - nvinfer1::DataType::kINT64); - const auto commPtrsData = bufferCast(*commPtrs); - - for (size_t memIdx = 0; memIdx < mIpcMemoryHandles.size(); memIdx++) - { - const auto& memCommPtrs = mIpcMemoryHandles[memIdx]->getCommPtrsTensor(); - for (SizeType tpIdx = 0; tpIdx < worldConfig.getTensorParallelism(); tpIdx++) - { - commPtrsData[memIdx * worldConfig.getTensorParallelism() + tpIdx] = memCommPtrs[tpIdx]; - } - } -} - -void RuntimeBuffers::reshape( - GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) +void RuntimeBuffers::reshape(GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); @@ -222,8 +207,10 @@ void RuntimeBuffers::reshape( lastTokenIds->reshape(ITensor::makeShape({batchSize})); - auto kvCacheShape + auto kvCacheReserve = ITensor::makeShape({batchSize, 2, modelConfig.getNbKvHeads(), maxSeqLength, modelConfig.getSizePerHead()}); + auto kvCacheShape + = ITensor::makeShape({batchSize, 2, modelConfig.getNbKvHeads(), 
maxInputLength, modelConfig.getSizePerHead()}); if (modelConfig.usePagedKvCache()) { auto const localNbLayers = modelConfig.getNbLayers(worldConfig.getPipelineParallelism()); @@ -240,7 +227,7 @@ void RuntimeBuffers::reshape( } else { - utils::reshapeBufferVector(presentKeysVals, kvCacheShape); + utils::reshapeBufferVector(presentKeysVals, kvCacheReserve); } if (modelConfig.useGptAttentionPlugin()) @@ -250,7 +237,10 @@ void RuntimeBuffers::reshape( } else { - utils::reshapeBufferVector(presentKeysValsAlt, kvCacheShape); + utils::reshapeBufferVector(presentKeysValsAlt, kvCacheReserve); + // present KV cache tensors will be reshaped by shape inference. + // reshape to the required shape here to make context batch slicing work correctly. + utils::reshapeBufferVector(presentKeysVals, kvCacheShape); } auto const cacheIndirShape = ITensor::makeShape({batchSize, beamWidth, maxSeqLength}); @@ -260,9 +250,9 @@ void RuntimeBuffers::reshape( if (worldConfig.isPipelineParallel()) { // reserve max size - auto const maxNumTokens = std::max(batchSize * beamWidth, batchSize * maxInputLength); + auto const maxNumTokens = std::max(beamWidth, maxInputLength); auto const hiddenSize = modelConfig.getHiddenSize() * worldConfig.getTensorParallelism(); - auto const hiddenStatesShape = ITensor::makeShape({1, maxNumTokens, hiddenSize}); + auto const hiddenStatesShape = ITensor::makeShape({batchSize, maxNumTokens, hiddenSize}); hiddenStates->reshape(hiddenStatesShape); } @@ -270,8 +260,104 @@ void RuntimeBuffers::reshape( TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::gatherLastTokenLogits(BufferManager& manager, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig) +void RuntimeBuffers::reset(BufferManager& manager) +{ + clearTensorMaps(); + manager.setZero(*cacheIndirectionDecoderInput); + manager.setZero(*cacheIndirectionDecoderOutput); +} + +std::vector RuntimeBuffers::split( + SizeType contextBatchSize, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + + std::vector bufferSlices; + auto const generationBatchSize = generationConfig.batchSize; + bufferSlices.reserve(tc::ceilDiv(generationBatchSize, contextBatchSize)); + if (contextBatchSize >= generationBatchSize) + { + bufferSlices.emplace_back(*this); + } + else + { + for (auto offset = 0; offset < generationBatchSize; offset += contextBatchSize) + { + auto const batchSize = std::min(contextBatchSize, generationBatchSize - offset); + auto& buffers = bufferSlices.emplace_back(); + buffers.generationConfig = generationConfig; + buffers.generationConfig.batchSize = batchSize; + + buffers.contextLengthsHost = ITensor::slice(contextLengthsHost, offset, batchSize); + buffers.contextLengthsDevice = ITensor::slice(contextLengthsDevice, offset, batchSize); + + if (worldConfig.isLastPipelineParallelRank() && !modelConfig.computeContextLogits()) + { + buffers.logits = ITensor::slice(logits, offset, batchSize); + } + + buffers.lastTokenIds = ITensor::slice(lastTokenIds, offset, batchSize); + + if (modelConfig.usePagedKvCache()) + { + auto const& realCacheBlockPointersShape = kvCacheBlockPointersHost->getShape(); + auto const localNbLayers = realCacheBlockPointersShape.d[0]; + auto const maxBlocksPerSeq = realCacheBlockPointersShape.d[3]; + + // enable slicing by moving generationBatchSize to first dim + auto const fakeCacheBlockPointersShape + = ITensor::makeShape({generationBatchSize, localNbLayers, 2, maxBlocksPerSeq}); 
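                // The "fake" shape above only moves the batch extent into dim 0 so that the views
                // created below can be sliced per context batch with ITensor::slice; each slice is
                // then reshaped back to the regular {localNbLayers, batchSize, 2, maxBlocksPerSeq}
                // layout.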
+ TensorPtr kvCacheBlockPointersHostView{ + ITensor::view(kvCacheBlockPointersHost, fakeCacheBlockPointersShape)}; + TensorPtr kvCacheBlockPointersDeviceView{ + ITensor::view(kvCacheBlockPointersDevice, fakeCacheBlockPointersShape)}; + + // slice and reshape to correct shape + auto const cacheBlockPointersShape = ITensor::makeShape({localNbLayers, batchSize, 2, maxBlocksPerSeq}); + buffers.kvCacheBlockPointersHost = ITensor::slice(kvCacheBlockPointersHostView, offset, batchSize); + buffers.kvCacheBlockPointersHost->reshape(cacheBlockPointersShape); + buffers.kvCacheBlockPointersDevice = ITensor::slice(kvCacheBlockPointersDeviceView, offset, batchSize); + buffers.kvCacheBlockPointersDevice->reshape(cacheBlockPointersShape); + } + else + { + buffers.presentKeysVals = utils::sliceBufferVector(presentKeysVals, offset, batchSize); + } + + if (modelConfig.useGptAttentionPlugin()) + { + buffers.pastKeyValueLengths = ITensor::slice(pastKeyValueLengths, offset, batchSize); + buffers.requestTypes = ITensor::slice(requestTypes, offset, batchSize); + } + else + { + buffers.presentKeysValsAlt = utils::sliceBufferVector(presentKeysValsAlt, offset, batchSize); + } + + if (worldConfig.isPipelineParallel()) + { + buffers.hiddenStates = ITensor::slice(hiddenStates, offset, batchSize); + } + + buffers.cacheIndirectionDecoderOutput = ITensor::slice(cacheIndirectionDecoderOutput, offset, batchSize); + + if (modelConfig.usePromptTuning()) + { + auto const& ptuningEnabled = promptTuningParams.promptTuningEnabled; + buffers.promptTuningParams.promptTuningEnabled + = std::vector(ptuningEnabled.begin() + offset, ptuningEnabled.begin() + offset + batchSize); + + buffers.promptTuningParams.tasks = ITensor::slice(promptTuningParams.tasks, offset, batchSize); + } + } + } + + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + return bufferSlices; +} + +void RuntimeBuffers::gatherLastTokenLogits( + BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TLLM_CHECK_WITH_INFO(modelConfig.computeContextLogits(), @@ -294,8 +380,29 @@ void RuntimeBuffers::gatherLastTokenLogits(BufferManager& manager, GenerationCon TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::tile(BufferManager& manager, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig) +void RuntimeBuffers::copyAttentionMasks(std::vector const& contextBatches, BufferManager& manager) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto const batchSize = generationConfig.batchSize; + auto const maxInputLength = generationConfig.maxInputLength; + + // TODO(rkobus) include tiling + attentionMask = manager.gpu(ITensor::makeShape({batchSize, maxInputLength}), nvinfer1::DataType::kINT32); + + auto const numContextBatches = static_cast(contextBatches.size()); + auto offset = 0; + for (auto contextBatchId = 0; contextBatchId < numContextBatches; ++contextBatchId) + { + auto& buffers = contextBatches.at(contextBatchId); + auto contextBatchSize = buffers.generationConfig.batchSize; + auto attentionMaskSlice = ITensor::slice(attentionMask, offset, contextBatchSize); + manager.copy(*buffers.attentionMask, *attentionMaskSlice); + offset += contextBatchSize; + } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +void RuntimeBuffers::tile(BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const beamWidth = 
generationConfig.beamWidth; @@ -333,7 +440,7 @@ void RuntimeBuffers::tile(BufferManager& manager, GenerationConfig const& genera TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::postContextStep(BufferManager& manager, GenerationConfig const& generationConfig, +void RuntimeBuffers::postContextStep(std::vector const& contextBuffers, BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); @@ -346,15 +453,22 @@ void RuntimeBuffers::postContextStep(BufferManager& manager, GenerationConfig co auto hostRequestTypes = bufferCast(*requestTypes); std::fill_n(hostRequestTypes, requestTypes->getSize(), 1); } + else + { + copyAttentionMasks(contextBuffers, manager); + } + + // TODO(rkobus) handle this more gracefully + positionIds = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); if (modelConfig.computeContextLogits()) { - gatherLastTokenLogits(manager, generationConfig, modelConfig, worldConfig); + gatherLastTokenLogits(manager, modelConfig, worldConfig); } if (beamWidth > 1) { - tile(manager, generationConfig, modelConfig, worldConfig); + tile(manager, modelConfig, worldConfig); } // use output lengths after context step @@ -371,12 +485,25 @@ void RuntimeBuffers::postContextStep(BufferManager& manager, GenerationConfig co kvCacheBlockPointersHost->reshape(cacheBlockPointersShape); kvCacheBlockPointersDevice->reshape(cacheBlockPointersShape); } + + if (modelConfig.usePromptTuning()) + { + std::vector reqBeamWidths(batchSize, beamWidth); + //// Note: reqPromptLenghts won't be used + std::vector reqPromptLengths; + // Copy the generationInput tasks to host + promptTuningTasksHost = manager.copyFrom(*promptTuningParams.tasks, MemoryType::kPINNED); + // Update the promptTuningParams tasks tensor + promptTuningParams.fillTasksTensor(promptTuningTasksHost, batchSize, 0, reqBeamWidths, reqPromptLengths, + manager, modelConfig.usePackedInput()); + } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType const padId, BufferManager& manager, - KvCacheManager const* kvCacheManager, SizeType firstBatchSlotIdx, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig) + KvCacheManager const* kvCacheManager, SizeType firstBatchSlotIdx, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& stream = manager.getStream(); @@ -391,12 +518,10 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c auto pastKeyValueLengthsPtr = bufferCast(*pastKeyValueLengths); TLLM_CHECK(pastKeyValueLengths->getSize() == static_cast(batchSize)); std::fill_n(pastKeyValueLengthsPtr, batchSize, 0); - if (modelConfig.useGptAttentionPlugin()) - { - auto RequestTypesPtr = bufferCast(*requestTypes); - TLLM_CHECK(requestTypes->getSize() == static_cast(batchSize)); - std::fill_n(RequestTypesPtr, batchSize, 0); - } + + auto RequestTypesPtr = bufferCast(*requestTypes); + TLLM_CHECK(requestTypes->getSize() == static_cast(batchSize)); + std::fill_n(RequestTypesPtr, batchSize, 0); auto const& inputShape = inputIds->getShape(); auto const contextLengthsHostPtr = bufferCast(*contextLengthsHost); @@ -417,10 +542,19 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c } else if (modelVariant == GptModelConfig::ModelVariant::kGlm) { - auto const positionIdsVec = 
getPositionIdsContextPhaseGlm( - batchSize, maxInputLength, contextLengthsHostPtr, modelConfig.useGptAttentionPlugin()); - auto const positionIdsShape = ITensor::makeShape({batchSize, 2, maxInputLength}); - positionIds = manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + auto const positionIdsVec = getPositionIdsContextPhaseGlm(batchSize, maxInputLength, contextLengthsHostPtr, + modelConfig.useGptAttentionPlugin(), modelConfig.usePackedInput()); + if (modelConfig.usePackedInput()) + { + int num_tokens = (int) positionIdsVec.size() / 2; + auto const positionIdsShape = ITensor::makeShape({1, 2, num_tokens}); + positionIds = manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + } + else + { + auto const positionIdsShape = ITensor::makeShape({batchSize, 2, maxInputLength}); + positionIds = manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + } } else { @@ -433,6 +567,23 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c auto const hiddenStatesShape = ITensor::makeShape({inputShape.d[0], inputShape.d[1], hiddenSize}); hiddenStates->reshape(hiddenStatesShape); } + + if (modelConfig.usePromptTuning()) + { + std::vector reqBeamWidths(batchSize, 1); + std::vector reqPromptLengths; + for (SizeType i = 0; i < batchSize; ++i) + { + reqPromptLengths.push_back(contextLengthsHostPtr[i]); + } + + // Copy the generationInput tasks to host + promptTuningTasksHost = manager.copyFrom(*promptTuningParams.tasks, MemoryType::kPINNED); + + // Update the tasks tensor + promptTuningParams.fillTasksTensor(promptTuningTasksHost, batchSize, batchSize, reqBeamWidths, + reqPromptLengths, manager, modelConfig.usePackedInput()); + } } else { @@ -470,14 +621,12 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c manager.copy(*contextLengthsDevice, *lastTokenIds); } - manager.setZero(*cacheIndirectionDecoderInput); - manager.setZero(*cacheIndirectionDecoderOutput); TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, BufferManager& manager, - KvCacheManager* kvCacheManager, SizeType firstBatchSlotIdx, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig) + KvCacheManager* kvCacheManager, SizeType firstBatchSlotIdx, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& stream = manager.getStream(); @@ -519,10 +668,18 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, B } else if (modelVariant == GptModelConfig::ModelVariant::kGlm) { - auto const positionIdsVec = getPositionIdsGenerationPhaseGlm( - batchSize, beamWidth, step, contextLengthsHostPtr, modelConfig.useGptAttentionPlugin()); - auto const positionIdsShape = ITensor::makeShape({batchSize * beamWidth, 2, 1}); - positionIds = manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + auto const positionIdsVec = getPositionIdsGenerationPhaseGlm(batchSize, beamWidth, step, + contextLengthsHostPtr, modelConfig.useGptAttentionPlugin(), modelConfig.usePackedInput()); + if (modelConfig.usePackedInput()) + { + auto const positionIdsShape = ITensor::makeShape({1, 2, batchSize * beamWidth}); + positionIds = manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + } + else + { + auto const positionIdsShape = ITensor::makeShape({batchSize * beamWidth, 2, 1}); + positionIds = 
manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + } } else { @@ -538,7 +695,7 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, B } else { - auto const shape = attentionMask->getShape(); + auto const& shape = attentionMask->getShape(); auto const nbInputs = shape.d[0]; auto const oldLength = shape.d[1]; auto const newLength = oldLength + 1; @@ -583,13 +740,13 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, B { kernels::invokeInclusiveSum(*lastTokenIds, *lastTokenIds, manager, stream); } - TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return nextInputIds; } void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, SizeType const step, - TensorPtr const& inputIds, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) const + TensorPtr const& inputIds, TensorPtr const& commPtrs, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) const { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); inputBuffers.clear(); @@ -676,49 +833,110 @@ void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outpu { inputBuffers.insert_or_assign("all_reduce_workspace", commPtrs); } + + if (modelConfig.usePromptTuning()) + { + inputBuffers.insert_or_assign("prompt_embedding_table", promptTuningParams.embeddingTable); + inputBuffers.insert_or_assign("tasks", promptTuningParams.tasks); + inputBuffers.insert_or_assign("prompt_vocab_size", promptTuningParams.vocabSize); + } TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -std::vector RuntimeBuffers::getPositionIdsContextPhaseGlm( - SizeType batchSize, SizeType maxInputLength, SizeType const* pInputLengths, bool useGptAttentionPlugin) +std::vector RuntimeBuffers::getPositionIdsContextPhaseGlm(const SizeType& batchSize, + const SizeType& maxInputLength, const SizeType* pInputLengths, bool useGptAttentionPlugin, bool usePackedInput) { TLLM_CHECK(pInputLengths != nullptr); - auto const size = batchSize * 2 * maxInputLength; - std::vector positionIdsVec(size, 0); - - for (SizeType b = 0; b < batchSize; ++b) + std::vector positionIdsVec(1, 0); + if (useGptAttentionPlugin) { - auto* pIdB = positionIdsVec.data() + b * 2 * maxInputLength; - auto const length = pInputLengths[b]; - std::iota(pIdB, pIdB + length, 0); - - pIdB[length - 1] = length - 2; - pIdB[length - 1 + maxInputLength] = 1; - } - - return positionIdsVec; -} - -std::vector RuntimeBuffers::getPositionIdsGenerationPhaseGlm( - SizeType batchSize, SizeType beamSize, SizeType step, SizeType const* pInputLengths, bool useGptAttentionPlugin) -{ - TLLM_CHECK(pInputLengths != nullptr); - - auto const size = batchSize * beamSize * 2; - std::vector positionIdsVec(size, 0); - - for (SizeType b = 0; b < batchSize; ++b) - { - auto* pIdB = positionIdsVec.data() + b * beamSize * 2; - auto const length = pInputLengths[b * beamSize]; - - for (SizeType bm = 0; bm < beamSize; ++bm) + if (usePackedInput) { - pIdB[bm * 2 + 0] = length - 2; - pIdB[bm * 2 + 1] = step + 2; + std::vector pInputLengthsAcc = std::vector(batchSize + 1, 0); + for (int i = 0; i < batchSize; ++i) + { + pInputLengthsAcc[i + 1] = pInputLengthsAcc[i] + pInputLengths[i]; + } + + auto const size = 1 * 2 * pInputLengthsAcc[batchSize]; + positionIdsVec.resize(size, 0); + for (SizeType b = 0; b < batchSize; ++b) + { + auto* pIdB = positionIdsVec.data() + pInputLengthsAcc[b]; + auto const length = pInputLengths[b]; + std::iota(pIdB, pIdB + length, 0); + + pIdB[length - 1] = length - 2; + pIdB[length - 1 
+ pInputLengthsAcc[batchSize]] = 1; + } + } + else + { + auto const size = batchSize * 2 * maxInputLength; + positionIdsVec.resize(size, 0); + for (SizeType b = 0; b < batchSize; ++b) + { + auto* pIdB = positionIdsVec.data() + b * 2 * maxInputLength; + auto const length = pInputLengths[b]; + std::iota(pIdB, pIdB + length, 0); + + pIdB[length - 1] = length - 2; + pIdB[length - 1 + maxInputLength] = 1; + } } } + else + { + TLLM_THROW("Unsupported model without GPT Attention Plugin"); + } + + return positionIdsVec; +} + +std::vector RuntimeBuffers::getPositionIdsGenerationPhaseGlm(const SizeType& batchSize, + const SizeType& beamSize, const SizeType& step, const SizeType* pInputLengths, bool useGptAttentionPlugin, + bool usePackedInput) +{ + TLLM_CHECK(pInputLengths != nullptr); + + auto const size = 2 * batchSize * beamSize; + std::vector positionIdsVec(size, 0); + if (useGptAttentionPlugin) + { + if (usePackedInput) + { + for (SizeType b = 0; b < batchSize; ++b) + { + auto* pIdB = positionIdsVec.data() + b * beamSize * 2; + auto const length = pInputLengths[b * beamSize]; + + for (SizeType bm = 0; bm < beamSize; ++bm) + { + pIdB[bm * 2 + 0] = length - 2; + pIdB[bm * 2 + 1] = step + 2; + } + } + } + else + { + for (SizeType b = 0; b < batchSize; ++b) + { + auto* pIdB = positionIdsVec.data() + b * beamSize * 2; + auto const length = pInputLengths[b * beamSize]; + + for (SizeType bm = 0; bm < beamSize; ++bm) + { + pIdB[bm * 2 + 0] = length - 2; + pIdB[bm * 2 + 1] = step + 2; + } + } + } + } + else + { + TLLM_THROW("Unsupported model without GPT Attention Plugin"); + } return positionIdsVec; } diff --git a/cpp/tensorrt_llm/runtime/runtimeBuffers.h b/cpp/tensorrt_llm/runtime/runtimeBuffers.h index 96b31143d4..72b59ef364 100644 --- a/cpp/tensorrt_llm/runtime/runtimeBuffers.h +++ b/cpp/tensorrt_llm/runtime/runtimeBuffers.h @@ -19,8 +19,12 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/gptModelConfig.h" #include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/promptTuningParams.h" #include "tensorrt_llm/runtime/worldConfig.h" +#include +#include + namespace tensorrt_llm::batch_manager::kv_cache_manager { class KVCacheManager; @@ -28,7 +32,6 @@ class KVCacheManager; namespace tensorrt_llm::runtime { -class IpcMemory; class TllmRuntime; class RuntimeBuffers @@ -40,11 +43,39 @@ protected: public: using TensorMap = StringPtrMap; + class GenerationConfig + { + public: + GenerationConfig() = default; + + explicit GenerationConfig(SizeType batchSize, SizeType beamWidth, SizeType maxInputLength, + SizeType maxSeqLength, SizeType inputLengthSum = SizeType(0)) + : batchSize{batchSize} + , beamWidth{beamWidth} + , maxInputLength{maxInputLength} + , maxSeqLength{maxSeqLength} + , inputLengthSum{inputLengthSum} + { + } + + SizeType batchSize{}; + SizeType beamWidth{}; + SizeType maxInputLength{}; + SizeType maxSeqLength{}; + SizeType inputLengthSum{}; // Initialized only if inputPacked is set to true in fromInput. 
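As a reading aid for the getPositionIdsContextPhaseGlm rewrite above: with packed input, the helper lays out two rows of position ids over the concatenated tokens, using a prefix sum of the per-sequence lengths to locate each sequence's offset. A minimal standalone sketch of that arithmetic, using plain std::vector<int32_t> in place of the runtime's SizeType buffers (the function name and types here are illustrative, not the patch's API):

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

// Sketch of the ChatGLM context-phase position ids for packed input.
// Row 0 holds per-token positions (the last token repeats length - 2);
// row 1 is all zeros except a 1 on each sequence's last token.
// Assumes every length >= 2, as the original helper does.
std::vector<int32_t> glmPackedContextPositionIds(std::vector<int32_t> const& inputLengths)
{
    std::vector<int32_t> offsets(inputLengths.size() + 1, 0);
    for (std::size_t i = 0; i < inputLengths.size(); ++i)
    {
        offsets[i + 1] = offsets[i] + inputLengths[i]; // prefix sum over sequence lengths
    }

    auto const totalTokens = offsets.back();
    std::vector<int32_t> positionIds(2 * totalTokens, 0);

    for (std::size_t b = 0; b < inputLengths.size(); ++b)
    {
        auto* row0 = positionIds.data() + offsets[b];
        auto const length = inputLengths[b];
        std::iota(row0, row0 + length, 0);
        row0[length - 1] = length - 2;
        row0[length - 1 + totalTokens] = 1; // writes into row 1 of the flattened layout
    }
    return positionIds;
}
```

For lengths {3, 5} this produces row 0 = [0 1 1 | 0 1 2 3 3] and row 1 = [0 0 1 | 0 0 0 0 1], which matches the non-packed branch once the per-sequence padding is dropped.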
+ + static GenerationConfig fromInput(ITensor const& inputIds, ITensor const& inputLengths, bool inputPacked, + SizeType beamWidth, SizeType maxSequenceLength); + }; + +public: + GenerationConfig generationConfig{}; + std::array inputBuffers{}; + std::array outputBuffers{}; + // general TensorPtr contextLengthsHost; TensorPtr contextLengthsDevice; - TensorPtr inputOffsets; // helper for packed input - TensorPtr kvCacheBlockPointersHost; // [numLayers, batchSize * beamWidth, 2, maxBlocksPerSeq * 2] // engine TensorPtr logits; @@ -57,6 +88,7 @@ public: std::vector presentKeysVals; std::vector presentKeysValsAlt; // without attention plugin + TensorPtr kvCacheBlockPointersHost; // [numLayers, batchSize * beamWidth, 2, maxBlocksPerSeq * 2] TensorPtr kvCacheBlockPointersDevice; // [numLayers, batchSize * beamWidth, 2, maxBlocksPerSeq * 2] // References to tmp buffers @@ -74,82 +106,58 @@ public: // pipeline parallelism TensorPtr hiddenStates; - // tensor parallelism - TensorPtr commPtrs; + // Prompt tuning + PromptTuningParams promptTuningParams; + TensorPtr promptTuningTasksHost; // Tensor to hold tasks on host bool allocated{false}; -private: - std::vector> mIpcMemoryHandles; - -public: - class GenerationConfig - { - public: - GenerationConfig() = default; - - GenerationConfig(SizeType batchSize, SizeType beamWidth, SizeType maxInputLength, SizeType maxNewTokens, - SizeType maxSeqLength, SizeType inputLengthSum = SizeType(0)) - : batchSize{batchSize} - , beamWidth{beamWidth} - , maxInputLength{maxInputLength} - , maxNewTokens{maxNewTokens} - , maxSeqLength{maxSeqLength} - , inputLengthSum{inputLengthSum} - { - } - - SizeType batchSize{}; - SizeType beamWidth{}; - SizeType maxInputLength{}; - SizeType maxNewTokens{}; - SizeType maxSeqLength{}; - SizeType inputLengthSum{}; // Initialized only if inputPacked is set to true in fromInput. - - static RuntimeBuffers::GenerationConfig fromInput(ITensor const& inputIds, ITensor const& inputLengths, - bool const inputPacked, SizeType const beamWidth, SizeType const maxSequenceLength, - std::optional const& maxNewTokensOpt); - }; - public: void clear(); + void clearTensorMaps(); void create(TllmRuntime& runtime, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - void initContextLengths(TensorPtr const& inputLengths, BufferManager& manager); + void initFromInput(ITensor const& inputIds, TensorPtr const& inputLengths, bool inputPacked, SizeType beamWidth, + SizeType maxSequenceLength, BufferManager& manager); - void reshape( - GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); + //! 
\brief Reshape buffers based on current GenerationConfig + void reshape(GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - void postContextStep(BufferManager& manager, GenerationConfig const& generationConfig, + void reset(BufferManager& manager); + + std::vector split( + SizeType contextBatchSize, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); + + void postContextStep(std::vector const& contextBuffers, BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); void prepareContextStep(TensorPtr const& inputIds, TokenIdType padId, BufferManager& manager, - KvCacheManager const* kvCacheManager, SizeType firstBatchSlotIdx, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - TensorPtr prepareNextStep(SizeType step, BufferManager& manager, KvCacheManager* kvCacheManager, - SizeType firstBatchSlotIdx, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, + KvCacheManager const* kvCacheManager, SizeType firstBatchSlotIdx, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); + TensorPtr prepareNextStep(SizeType step, BufferManager& manager, KvCacheManager* kvCacheManager, + SizeType firstBatchSlotIdx, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - void getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, SizeType step, TensorPtr const& inputIds, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig) const; - - void createCustomAllReduceWorkspace(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, - SizeType hiddenSize, WorldConfig const& worldConfig, BufferManager& manager); + void getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, SizeType const step, + TensorPtr const& inputIds, TensorPtr const& commPtrs, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) const; private: - void gatherLastTokenLogits(BufferManager& manager, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig); + void gatherLastTokenLogits( + BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); + + void copyAttentionMasks(std::vector const& contextBatches, BufferManager& manager); // Some tensors are properly tiled, some are just reshaped. 
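The split / postContextStep pair declared above lets the session run the context phase in micro-batches of at most contextBatchSize requests. The actual implementation slices every runtime buffer; the batching arithmetic underneath reduces to offset/size pairs over the batch dimension, sketched below with a hypothetical name (makeContextMicroBatches is not part of the patch):

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Hypothetical helper: chop `batchSize` requests into context micro-batches of at
// most `contextBatchSize`, expressed as (offset, size) pairs over the batch dimension.
std::vector<std::pair<int, int>> makeContextMicroBatches(int batchSize, int contextBatchSize)
{
    std::vector<std::pair<int, int>> slots;
    for (int offset = 0; offset < batchSize; offset += contextBatchSize)
    {
        slots.emplace_back(offset, std::min(contextBatchSize, batchSize - offset));
    }
    return slots;
}
```

Each (offset, size) pair corresponds to an ITensor::slice(buffer, offset, size) view, which is also the pattern the sliceBufferVector helper added to sessionUtils later in this patch applies across whole vectors of buffers.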
- void tile(BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, - WorldConfig const& worldConfig); + void tile(BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - static std::vector getPositionIdsContextPhaseGlm( - SizeType batchSize, SizeType maxInputLength, SizeType const* pInputLengths, bool useGptAttentionPlugin); + static std::vector getPositionIdsContextPhaseGlm(const SizeType& batchSize, + const SizeType& maxInputLength, const SizeType* pInputLengths, const bool useGptAttentionPlugin, + const bool usePackedInput); - static std::vector getPositionIdsGenerationPhaseGlm(SizeType batchSize, SizeType beamSize, SizeType step, - SizeType const* pInputLengths, bool useGptAttentionPlugin); + static std::vector getPositionIdsGenerationPhaseGlm(const SizeType& batchSize, const SizeType& beamSize, + const SizeType& step, const SizeType* pInputLengths, const bool useGptAttentionPlugin, + const bool usePackedInput); }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.cu b/cpp/tensorrt_llm/runtime/runtimeKernels.cu index 068fba246d..45da61f20f 100644 --- a/cpp/tensorrt_llm/runtime/runtimeKernels.cu +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.cu @@ -747,6 +747,24 @@ void invokeCopyPackedInputToOutput(ITensor& outputIds, ITensor const& inputIds, maxInputLength, maxSeqLength); } +void initOutputIds(ITensor& outputIds, ITensor const& inputIds, ITensor const& inputLengths, + ITensor const& inputOffsets, TokenIdType const padId, TokenIdType const endId, SizeType const maxInputLength, + bool const inputPacked, CudaStream const& stream) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + kernels::invokeFill(outputIds, endId, stream); + + if (inputPacked) + { + kernels::invokeCopyPackedInputToOutput(outputIds, inputIds, inputOffsets, maxInputLength, padId, stream); + } + else + { + kernels::invokeCopyInputToOutput(outputIds, inputIds, inputLengths, padId, stream); + } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + namespace { template diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.h b/cpp/tensorrt_llm/runtime/runtimeKernels.h index de1af30274..8b08d68ca0 100644 --- a/cpp/tensorrt_llm/runtime/runtimeKernels.h +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.h @@ -68,6 +68,10 @@ void invokeCopyInputToOutput( void invokeCopyPackedInputToOutput(ITensor& outputIds, ITensor const& inputIds, ITensor const& inputOffsets, SizeType maxInputLength, SizeType padId, CudaStream const& stream); +void initOutputIds(ITensor& outputIds, ITensor const& inputIds, ITensor const& inputLengths, + ITensor const& inputOffsets, TokenIdType padId, TokenIdType endId, SizeType maxInputLength, bool inputPacked, + CudaStream const& stream); + void scatterTensor(ITensor& output, ITensor const& input, SizeType beamWidth, CudaStream const& stream); void tileTensor(ITensor& output, ITensor const& input, SizeType beamWidth, CudaStream const& stream); diff --git a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp index 0117a37008..fb438ab934 100644 --- a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp @@ -114,32 +114,10 @@ void StatefulGptDecoder::reshapeBuffers(SizeType batchSize, SizeType beamWidth, dOutput.beamHypotheses.release(); } - mMaxNewTokens = 0; mNbSteps = 0; TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -namespace -{ -void initOutputIds(TensorPtr const& outputIds, TensorPtr 
const& inputIds, TensorPtr const& inputLengths, - TensorPtr const& inputOffsets, SizeType const padId, SizeType const endId, SizeType const maxInputLength, - bool const inputPacked, CudaStream const& stream) -{ - TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - kernels::invokeFill(*outputIds, endId, stream); - - if (inputPacked) - { - kernels::invokeCopyPackedInputToOutput(*outputIds, *inputIds, *inputOffsets, maxInputLength, padId, stream); - } - else - { - kernels::invokeCopyInputToOutput(*outputIds, *inputIds, *inputLengths, padId, stream); - } - TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} -} // namespace - void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig const& samplingConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); @@ -174,11 +152,6 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig kernels::invokeInclusiveSum(*ITensor::slice(inputOffsets, 1), *inputLengths, manager, *stream); } - mMaxNewTokens = inputs.maxNewTokens.value_or(mMaxSequenceLength - maxInputLength); - TLLM_CHECK_WITH_INFO(maxInputLength + mMaxNewTokens <= mMaxSequenceLength, - tc::fmtstr("Input length (%d) + max new tokens (%d) must be less than max sequence length (%d).", - maxInputLength, mMaxNewTokens, mMaxSequenceLength)); - TLLM_CHECK(inputIds->getDataType() == TRTDataType::value); auto const endId = inputs.endId; auto const padId = inputs.padId; @@ -191,9 +164,21 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig dInput.embeddingBias = inputs.embeddingBiasOpt; dInput.badWordsList = inputs.badWordsList; dInput.stopWordsList = inputs.stopWordsList; - kernels::invokeFill(const_cast(*dInput.sequenceLimitLength), mMaxSequenceLength, *stream); auto inputLengthsView = ITensor::view(dInput.lengths, ITensor::makeShape({batchSize * beamWidth})); kernels::tileTensor(const_cast(*inputLengthsView), *inputLengths, beamWidth, *stream); + if (inputs.maxNewTokens) + { + auto const maxNewTokens = inputs.maxNewTokens.value(); + TLLM_CHECK_WITH_INFO(maxInputLength + maxNewTokens <= mMaxSequenceLength, + tc::fmtstr("Input length (%d) + max new tokens (%d) must be less than max sequence length (%d).", + maxInputLength, maxNewTokens, mMaxSequenceLength)); + manager.copy(*inputLengths, const_cast(*dInput.sequenceLimitLength)); + kernels::invokeAdd(const_cast(*dInput.sequenceLimitLength), maxNewTokens, *stream); + } + else + { + kernels::invokeFill(const_cast(*dInput.sequenceLimitLength), mMaxSequenceLength, *stream); + } // output auto& dOutput = *mDecodingOutput; @@ -227,8 +212,8 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig } // copy the request ids into dOutput.ids (with tiling) - initOutputIds( - dOutput.ids, inputIds, inputLengths, inputOffsets, padId, endId, maxInputLength, inputs.packed, *stream); + kernels::initOutputIds( + *dOutput.ids, *inputIds, *inputLengths, *inputOffsets, padId, endId, maxInputLength, inputs.packed, *stream); // remaining mNbSteps = 0; diff --git a/cpp/tensorrt_llm/runtime/statefulGptDecoder.h b/cpp/tensorrt_llm/runtime/statefulGptDecoder.h index 141d5b9a81..0276518ce5 100644 --- a/cpp/tensorrt_llm/runtime/statefulGptDecoder.h +++ b/cpp/tensorrt_llm/runtime/statefulGptDecoder.h @@ -90,6 +90,5 @@ private: SizeType mNbSteps; SizeType mMaxSequenceLength{}; - SizeType mMaxNewTokens; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/utils/debugUtils.cu b/cpp/tensorrt_llm/runtime/utils/debugUtils.cu new file mode 100644 index 
0000000000..8b34ec6612 --- /dev/null +++ b/cpp/tensorrt_llm/runtime/utils/debugUtils.cu @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "debugUtils.h" + +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/memoryUtils.h" + +namespace +{ + +__global__ void checkTensorNanKernel(const float* data, std::size_t size, int* foundNan) +{ + auto tidx = blockIdx.x * blockDim.x + threadIdx.x; + + int32_t found = 0; + + for (auto idx = tidx; idx < size; idx += blockDim.x * gridDim.x) + { + auto value = data[idx]; + if (isnan(value)) + { + found = 1; + break; + } + } + atomicCAS(foundNan, 0, found); +} +} // namespace + +using namespace tensorrt_llm::runtime; +namespace tc = tensorrt_llm::common; + +namespace tensorrt_llm::runtime::utils +{ + +void invokeCheckTensorNanKernel(const float* data, std::size_t size, int* foundNan, cudaStream_t stream) +{ + constexpr uint32_t kThreadsPerCta = 256; + checkTensorNanKernel<<>>(data, size, foundNan); +} + +bool tensorHasNan(const IBuffer& tensor, BufferManager& manager) +{ + auto foundNan = manager.pinned(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); + auto foundNanPtr = bufferCast(*foundNan); + foundNanPtr[0] = 0; + const auto size = tensor.getSize(); + invokeCheckTensorNanKernel(bufferCast(tensor), size, foundNanPtr, manager.getStream().get()); + manager.getStream().synchronize(); + return static_cast(foundNanPtr[0]); +} +} // namespace tensorrt_llm::runtime::utils diff --git a/cpp/tensorrt_llm/runtime/utils/debugUtils.h b/cpp/tensorrt_llm/runtime/utils/debugUtils.h new file mode 100644 index 0000000000..b8fa02e737 --- /dev/null +++ b/cpp/tensorrt_llm/runtime/utils/debugUtils.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
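The debugUtils helper above reduces a float tensor to a single pinned flag via a grid-stride kernel and then synchronizes the stream. A hypothetical call site is sketched below; checkForNans is not part of the patch and only uses the tensorHasNan signature shown above, which assumes the buffer holds floats:

```cpp
#include "tensorrt_llm/runtime/utils/debugUtils.h"

#include <cstdio>

namespace tlr = tensorrt_llm::runtime;

// Debug-only check: report NaNs in a float tensor such as the logits.
// tensorHasNan synchronizes the BufferManager's stream, so keep it behind a debug switch.
bool checkForNans(tlr::IBuffer const& tensor, tlr::BufferManager& manager, char const* name)
{
    bool const hasNan = tlr::utils::tensorHasNan(tensor, manager);
    if (hasNan)
    {
        std::fprintf(stderr, "NaN detected in %s\n", name);
    }
    return hasNan;
}
```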
+ */ +#pragma once + +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/runtimeKernels.h" + +namespace tensorrt_llm::runtime +{ +namespace utils +{ + +bool tensorHasNan(const IBuffer& tensor, BufferManager& manager); + +} +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp index c4144c295a..25455d9f14 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp @@ -89,6 +89,13 @@ void reshapeBufferVector(std::vector& vector, nvinfer1::Dims } } +std::vector sliceBufferVector( + std::vector const& vector, SizeType const offset, SizeType const size) +{ + return transformVector( + vector, [offset, size](auto const& buffer) { return std::shared_ptr{ITensor::slice(buffer, offset, size)}; }); +} + void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec, SizeType const indexOffset) { diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h index 80c18df375..8f9cb36a55 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h @@ -37,6 +37,16 @@ int initDevice(WorldConfig const& worldConfig); std::vector loadEngine(std::string const& enginePath); +template +auto transformVector(TInputContainer const& input, TFunc func) + -> std::vector> +{ + std::vector> output{}; + output.reserve(input.size()); + std::transform(input.begin(), input.end(), std::back_inserter(output), func); + return output; +} + std::vector createBufferVector(TllmRuntime const& runtime, SizeType indexOffset, SizeType numBuffers, std::string const& prefix, MemoryType memType); @@ -45,6 +55,9 @@ std::vector createBufferVector( void reshapeBufferVector(std::vector& vector, nvinfer1::Dims const& shape); +std::vector sliceBufferVector( + std::vector const& vector, SizeType offset, SizeType size); + void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec, SizeType indexOffset); diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt index d42520f2d9..234e605aff 100644 --- a/cpp/tensorrt_llm/thop/CMakeLists.txt +++ b/cpp/tensorrt_llm/thop/CMakeLists.txt @@ -21,6 +21,5 @@ target_link_libraries(th_utils PUBLIC ${TORCH_LIBRARIES} ${CUBLAS_LIB} add_library(th_common SHARED dynamicDecodeOp.cpp weightOnlyQuantOp.cpp gatherTreeOp.cpp fp8Op.cpp ncclCommunicatorOp.cpp) set_property(TARGET th_common PROPERTY POSITION_INDEPENDENT_CODE ON) -target_link_libraries( - th_common PRIVATE ${TORCH_LIBRARIES} th_utils ${Python3_LIBRARIES} - ${STATIC_TARGET} ${UNDEFINED_FLAG}) +target_link_libraries(th_common PRIVATE ${TORCH_LIBRARIES} th_utils + ${Python3_LIBRARIES} ${STATIC_TARGET}) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f773960014..a3fa915c84 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -74,6 +74,7 @@ add_gtest(tllmBuffersTest runtime/tllmBuffersTest.cpp) add_gtest(bufferManagerTest runtime/bufferManagerTest.cpp) add_gtest(runtimeKernelTest runtime/runtimeKernelTest.cpp) add_gtest(samplingTest runtime/samplingTest.cpp) +add_gtest(iTensorTest runtime/iTensorTest.cpp) add_gtest(torchTest runtime/torchTest.cpp) set(SAMPLING_KERNEL_TEST_SRC kernels/sampling/samplingTest.cpp diff --git a/cpp/tests/README.md b/cpp/tests/README.md index 4777c513bb..4d45295d5e 100644 --- a/cpp/tests/README.md +++ b/cpp/tests/README.md @@ -36,7 
+36,7 @@ To build the engines from the top-level directory: PYTHONPATH=examples/gpt:$PYTHONPATH python3 cpp/tests/resources/scripts/build_gpt_engines.py PYTHONPATH=examples/gptj:$PYTHONPATH python3 cpp/tests/resources/scripts/build_gptj_engines.py PYTHONPATH=examples/llama:$PYTHONPATH python3 cpp/tests/resources/scripts/build_llama_engines.py -PYTHONPATH=examples/CHATGLM6B:$PYTHONPATH python3 cpp/tests/resources/scripts/build_chatglm6b_engines.py +PYTHONPATH=examples/chatglm:$PYTHONPATH python3 cpp/tests/resources/scripts/build_chatglm_engines.py ``` It is possible to build engines with tensor and pipeline parallelism for LLaMA using 4 GPUs. @@ -53,8 +53,7 @@ End-to-end tests read inputs and expected outputs from Numpy files located at [c PYTHONPATH=examples/gpt:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_gpt_output.py PYTHONPATH=examples/gptj:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_gptj_output.py PYTHONPATH=examples/llama:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_llama_output.py -PYTHONPATH=examples/chatglm6b:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_chatglm6b_output.py -PYTHONPATH=examples/chatglm2-6b:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_chatglm2-6b_output.py +PYTHONPATH=examples/chatglm:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_chatglm_output.py ``` ### Generate data with tensor and pipeline parallelism diff --git a/cpp/tests/resources/scripts/build_chatglm6b_engines.py b/cpp/tests/resources/scripts/build_chatglm6b_engines.py deleted file mode 100755 index 4c20ad9ea1..0000000000 --- a/cpp/tests/resources/scripts/build_chatglm6b_engines.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
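Back in sessionUtils above, a generic transformVector template and a sliceBufferVector helper were added; the angle-bracketed template parameters are garbled in this listing, so the standalone reconstruction below is an assumption about the intended shape of the code rather than a copy of the patch:

```cpp
#include <algorithm>
#include <iterator>
#include <type_traits>
#include <vector>

// Generic "map into a vector" helper in the spirit of sessionUtils::transformVector.
template <typename TInputContainer, typename TFunc>
auto transformVector(TInputContainer const& input, TFunc func)
    -> std::vector<std::decay_t<decltype(func(*input.begin()))>>
{
    std::vector<std::decay_t<decltype(func(*input.begin()))>> output;
    output.reserve(input.size());
    std::transform(input.begin(), input.end(), std::back_inserter(output), func);
    return output;
}

int main()
{
    std::vector<int> lengths{3, 5, 2};
    // sliceBufferVector uses the same pattern, with a lambda that calls
    // ITensor::slice(buffer, offset, size) on each buffer in the vector.
    auto const doubled = transformVector(lengths, [](int x) { return 2 * x; });
    return doubled == std::vector<int>{6, 10, 4} ? 0 : 1;
}
```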
- -import argparse as _arg -import os as _os -import pathlib as _pl -import subprocess as _sp -import sys -import typing as _tp -from glob import glob as _glob - -import torch.multiprocessing as _mp - -resources_dir = _pl.Path( - __file__).parent.parent.parent.parent.parent / "examples/chatglm6b" -sys.path.insert(0, str(resources_dir)) - -engine_target_path = _pl.Path( - __file__).parent.parent / "models/rt_engine/chatglm6b" - -import build as _ecb - - -def build_engine(weight_dir: _pl.Path, engine_dir: _pl.Path, world_size, *args): - args = [ - '--log_level=error', - '--model_dir', - str(weight_dir), - '--output_dir', - str(engine_dir), - '--max_batch_size=2', - '--max_beam_width=2', - '--builder_opt=0', - f'--world_size={world_size}', - ] + list(args) - print("Running: " + " ".join(args)) - _ecb.run_build(args) - - -def run_command(command: _tp.Sequence[str], *, cwd=None, **kwargs) -> None: - - command = [str(i) for i in command] - print(f"Running: cd %s && %s" % - (str(cwd or _pl.Path.cwd()), " ".join(command))) - _sp.check_call(command, cwd=cwd, **kwargs) - - -def build_engines(model_cache: _tp.Optional[str] = None, world_size: int = 1): - - # Clone the model directory - hf_dir = resources_dir / "pyTorchModel" - trt_dir = resources_dir / "trtModel" - - run_command( - ["pip", "install", "-r", - str(resources_dir) + "/requirements.txt"], - cwd=resources_dir) - - if not _os.path.exists(hf_dir): - _os.mkdir(hf_dir) - - if len(_glob(str(hf_dir) + "/*")) == 0: - run_command( - [ - "git", - "clone", - "https://huggingface.co/THUDM/chatglm-6b", - hf_dir, - ], - cwd=resources_dir, - ) - - print("\nBuilding engine") - build_engine(hf_dir, trt_dir, world_size, "--dtype", "float16", - "--use_gpt_attention_plugin", "float16", "--use_gemm_plugin", - "float16") - - if not _os.path.exists(str(engine_target_path)): - _os.system(f"mkdir -p {str(engine_target_path)}") - - _os.system(f"cp -r {str(trt_dir) + '/*'} {engine_target_path}") - - print("Done.") - - -if __name__ == "__main__": - parser = _arg.ArgumentParser() - parser.add_argument("--model_cache", - type=str, - help="Directory where models are stored") - - parser.add_argument('--world_size', - type=int, - default=1, - help='world size, only support tensor parallelism now') - - _mp.set_start_method("spawn") - - build_engines(**vars(parser.parse_args())) diff --git a/cpp/tests/resources/scripts/build_chatglm2-6b_engines.py b/cpp/tests/resources/scripts/build_chatglm_engines.py similarity index 64% rename from cpp/tests/resources/scripts/build_chatglm2-6b_engines.py rename to cpp/tests/resources/scripts/build_chatglm_engines.py index 6a45b3e183..f3d50dcdf7 100755 --- a/cpp/tests/resources/scripts/build_chatglm2-6b_engines.py +++ b/cpp/tests/resources/scripts/build_chatglm_engines.py @@ -15,27 +15,31 @@ # limitations under the License. 
import argparse as _arg -import os as _os import pathlib as _pl +import shutil as _shutil import subprocess as _sp import sys import typing as _tp -from glob import glob as _glob +from collections import OrderedDict as _OrderedDict +from pathlib import Path as _Path import torch.multiprocessing as _mp resources_dir = _pl.Path( - __file__).parent.parent.parent.parent.parent / "examples/chatglm2-6b" + __file__).parent.parent.parent.parent.parent / "examples/chatglm" sys.path.insert(0, str(resources_dir)) engine_target_path = _pl.Path( - __file__).parent.parent / "models/rt_engine/chatglm2-6b" + __file__).parent.parent / "models/rt_engine/chatglm" import build as _ecb -def build_engine(weight_dir: _pl.Path, engine_dir: _pl.Path, world_size, *args): +def build_engine(model_version: str, weight_dir: _pl.Path, engine_dir: _pl.Path, + world_size, *args): args = [ + '-m', + str(model_version), '--log_level=error', '--model_dir', str(weight_dir), @@ -60,8 +64,14 @@ def run_command(command: _tp.Sequence[str], *, cwd=None, **kwargs) -> None: def build_engines(model_cache: _tp.Optional[str] = None, world_size: int = 1): - # Clone the model directory - hf_dir = resources_dir / "pyTorchModel" + model_name_dict = _OrderedDict([ + ["chatglm-6b", "1"], + ["chatglm2-6b", "2"], + ["chatglm3-6b", "3"], + ]) + hf_dir_list = [ + resources_dir / model_name for model_name in model_name_dict.keys() + ] trt_dir = resources_dir / "trtModel" run_command( @@ -69,29 +79,27 @@ def build_engines(model_cache: _tp.Optional[str] = None, world_size: int = 1): str(resources_dir) + "/requirements.txt"], cwd=resources_dir) - if not _os.path.exists(hf_dir): - _os.mkdir(hf_dir) + # Clone the model directory + for model_name, hf_dir in zip(model_name_dict.keys(), hf_dir_list): + if not _Path(hf_dir).exists(): + run_command( + [ + "git", + "clone", + "https://huggingface.co/THUDM/" + model_name, + ], + cwd=resources_dir, + ) - if len(_glob(str(hf_dir) + "/*")) == 0: - run_command( - [ - "git", - "clone", - "https://huggingface.co/THUDM/chatglm2-6b", - hf_dir, - ], - cwd=resources_dir, - ) + print("\nBuilding engines") + for model, hf_dir in zip(model_name_dict.items(), hf_dir_list): + print("Building %s" % model[0]) + build_engine(model[1], hf_dir, trt_dir, world_size) - print("\nBuilding engine") - build_engine(hf_dir, trt_dir, world_size, "--dtype", "float16", - "--use_gpt_attention_plugin", "float16", "--use_gemm_plugin", - "float16") - - if not _os.path.exists(str(engine_target_path)): - _os.system(f"mkdir -p {str(engine_target_path)}") - - _os.system(f"cp -r {str(trt_dir) + '/*'} {engine_target_path}") + if not _Path(engine_target_path).exists(): + _Path(engine_target_path).mkdir(parents=True, exist_ok=True) + for file in _Path(trt_dir).glob("*"): + _shutil.move(file, engine_target_path) print("Done.") diff --git a/cpp/tests/resources/scripts/generate_expected_chatglm2-6b_output.py b/cpp/tests/resources/scripts/generate_expected_chatglm2-6b_output.py deleted file mode 100755 index d3a109c313..0000000000 --- a/cpp/tests/resources/scripts/generate_expected_chatglm2-6b_output.py +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import pathlib as _pl -import sys -from pathlib import Path - -import numpy as np -import torch -import transformers - -import tensorrt_llm -from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import GenerationSession, ModelConfig, SamplingConfig - -resources_dir = _pl.Path( - __file__).parent.parent.parent.parent.parent / "examples/chatglm2-6b" -sys.path.insert(0, str(resources_dir)) - -from run import parse_arguments # isort:skip - -from build import find_engines # isort:skip - -MODEL_NAME = "chatglm2-6b" - - -def generate(batch_size, beam_width): - - print("generate expected ChatGLM2-6B output BatchSize=%d, BeamWidth=%d" % - (batch_size, beam_width)) - args = parse_arguments() - if batch_size == 1: - args.input_text = args.input_text[:1] - elif batch_size > 2: - args.input_text += args.input_text[0] * (batch_size - 2) - args.beam_width = beam_width - args.tokenizer_dir = resources_dir / "pyTorchModel" - args.engine_dir = _pl.Path( - __file__).parent.parent / "models/rt_engine/chatglm2-6b" - - tensorrt_llm.logger.set_level(args.log_level) - - config_path = os.path.join(args.engine_dir, 'config.json') - with open(config_path, 'r') as f: - config = json.load(f) - assert (config['builder_config']['name'] == MODEL_NAME) - dtype = config['builder_config']['precision'] - end_id = config['builder_config']['eos_token_id'] - pad_id = config['builder_config']['pad_token_id'] - use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] - world_size = config['builder_config']['tensor_parallel'] - assert world_size == tensorrt_llm.mpi_world_size( - ), f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - - runtime_rank = tensorrt_llm.mpi_rank() - runtime_mapping = tensorrt_llm.Mapping(world_size, - runtime_rank, - tp_size=world_size) - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - - serialize_path = find_engines(Path(args.engine_dir), - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] - - tokenizer = transformers.AutoTokenizer.from_pretrained( - args.tokenizer_dir, trust_remote_code=True) - input_text = args.input_text - tokenized = tokenizer(input_text, - return_tensors="pt", - padding=True, - return_length=True) - input_ids = tokenized['input_ids'].int().contiguous().cuda() - input_lengths = tokenized['length'].int().contiguous().cuda() - - if use_gpt_attention_plugin: - # when using gpt attention plugin, inputs needs to align at the head - input_ids_padding_right = torch.zeros_like(input_ids) + end_id - for i, sample in enumerate(input_ids): - nPadding = 0 - for token in sample: - if token == pad_id: - nPadding += 1 - else: - break - input_ids_padding_right[ - i, :len(sample[nPadding:])] = sample[nPadding:] - input_ids = input_ids_padding_right - - model_config = ModelConfig( - vocab_size=config['builder_config']['vocab_size'], - num_layers=config['builder_config']['num_layers'], - num_heads=config['builder_config']['num_heads'] // world_size, - num_kv_heads=config['builder_config']['num_kv_heads'] // world_size, - 
hidden_size=config['builder_config']['hidden_size'] // world_size, - gpt_attention_plugin=use_gpt_attention_plugin, - remove_input_padding=config['builder_config']['remove_input_padding'], - model_name=MODEL_NAME, - paged_kv_cache=config['builder_config']['paged_kv_cache'], - quant_mode=QuantMode(config['builder_config']['quant_mode']), - dtype=dtype, - ) - - sampling_config = SamplingConfig( - end_id=end_id, - pad_id=pad_id, - num_beams=args.beam_width, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - ) - sampling_config.random_seed = args.random_seed - - with open(serialize_path, 'rb') as f: - engine_buffer = f.read() - decoder = GenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) - decoder.setup(input_ids.size(0), input_ids.size(1), args.max_output_len, - args.beam_width) - output_ids = decoder.decode(input_ids, input_lengths, sampling_config) - torch.cuda.synchronize() - - data_path = _pl.Path(__file__).parent.parent / "data/chatglm2-6b" - if not os.path.exists(str(data_path)): - os.mkdir(data_path) - nBS, nBM = input_ids.size(0), args.beam_width - np.save( - str(data_path) + "/inputId-BS%d-BM%d.npy" % (nBS, nBM), - input_ids.detach().cpu().numpy()) - outputId = output_ids.detach().cpu().numpy() - - nMaxOutputLength = 0 - for single_output in outputId.reshape(nBS * nBM, -1): - nMaxOutputLength = max(nMaxOutputLength, - np.min(np.where(single_output == end_id))) - np.save( - str(data_path) + "/outputId-BS%d-BM%d.npy" % (nBS, nBM), - outputId[:, :, :(nMaxOutputLength + 1)]) - - -if __name__ == '__main__': - generate(batch_size=1, beam_width=1) - generate(batch_size=2, beam_width=1) - generate(batch_size=1, beam_width=2) - print("Finish!") diff --git a/cpp/tests/resources/scripts/generate_expected_chatglm6b_output.py b/cpp/tests/resources/scripts/generate_expected_chatglm_output.py similarity index 70% rename from cpp/tests/resources/scripts/generate_expected_chatglm6b_output.py rename to cpp/tests/resources/scripts/generate_expected_chatglm_output.py index 523309d990..44c5920eff 100755 --- a/cpp/tests/resources/scripts/generate_expected_chatglm6b_output.py +++ b/cpp/tests/resources/scripts/generate_expected_chatglm_output.py @@ -15,9 +15,8 @@ # limitations under the License. 
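The expected-output scripts above (the removed chatglm2-6b one and, presumably, the consolidated script that follows) re-align tokenized inputs at the head when the GPT attention plugin is used, moving the tokenizer's left padding to the tail. A per-row restatement of that shuffle in C++, for readers skimming the Python (alignRight is illustrative only):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Move leading pad tokens to the tail of one row, filling the freed slots with endId,
// mirroring the input_ids_padding_right loop in the expected-output scripts.
std::vector<int32_t> alignRight(std::vector<int32_t> const& row, int32_t padId, int32_t endId)
{
    std::size_t nPadding = 0;
    while (nPadding < row.size() && row[nPadding] == padId)
    {
        ++nPadding; // count the left padding added by the tokenizer
    }
    std::vector<int32_t> aligned(row.size(), endId);
    std::copy(row.begin() + nPadding, row.end(), aligned.begin());
    return aligned;
}
```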
import json -import os -import pathlib as _pl import sys +from collections import OrderedDict from pathlib import Path import numpy as np @@ -26,40 +25,45 @@ import transformers import tensorrt_llm from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import (ChatGLM6BHeadModelGenerationSession, +from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession, ModelConfig, SamplingConfig) -resources_dir = _pl.Path( - __file__).parent.parent.parent.parent.parent / "examples/chatglm6b" +resources_dir = Path( + __file__).parent.parent.parent.parent.parent / "examples/chatglm" sys.path.insert(0, str(resources_dir)) from run import parse_arguments # isort:skip from build import find_engines # isort:skip -MODEL_NAME = "chatglm-6b" +def generate(model_name, batch_size, beam_width): -def generate(batch_size, beam_width): + model_name_dict = OrderedDict([ + ["chatglm-6b", "1"], + ["chatglm2-6b", "2"], + ["chatglm3-6b", "3"], + ]) + + print("generate expected %s output BatchSize=%d, BeamWidth=%d" % + (model_name, batch_size, beam_width)) - print("generate expected ChatGLM-6B output BatchSize=%d, BeamWidth=%d" % - (batch_size, beam_width)) args = parse_arguments() if batch_size == 1: args.input_text = args.input_text[:1] elif batch_size > 2: args.input_text += args.input_text[0] * (batch_size - 2) + args.model_version = model_name_dict[model_name] args.beam_width = beam_width - args.tokenizer_dir = resources_dir / "pyTorchModel" - args.engine_dir = _pl.Path( - __file__).parent.parent / "models/rt_engine/chatglm6b" + args.tokenizer_dir = resources_dir / model_name + args.engine_dir = Path(__file__).parent.parent / "models/rt_engine/chatglm" tensorrt_llm.logger.set_level(args.log_level) - config_path = os.path.join(args.engine_dir, 'config.json') + config_path = Path(args.engine_dir) / (model_name + '-config.json') with open(config_path, 'r') as f: config = json.load(f) - assert (config['builder_config']['name'] == MODEL_NAME) + assert (config['builder_config']['name'] == model_name) dtype = config['builder_config']['precision'] end_id = config['builder_config']['eos_token_id'] pad_id = config['builder_config']['pad_token_id'] @@ -74,10 +78,13 @@ def generate(batch_size, beam_width): tp_size=world_size) torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - serialize_path = find_engines(Path(args.engine_dir), - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] + serialize_path = find_engines( + Path(args.engine_dir), + model_name=model_name, + dtype=dtype, + tp_size=world_size, + rank=runtime_rank, + )[0] tokenizer = transformers.AutoTokenizer.from_pretrained( args.tokenizer_dir, trust_remote_code=True) @@ -111,7 +118,7 @@ def generate(batch_size, beam_width): hidden_size=config['builder_config']['hidden_size'] // world_size, gpt_attention_plugin=use_gpt_attention_plugin, remove_input_padding=config['builder_config']['remove_input_padding'], - model_name=MODEL_NAME, + model_name=model_name, paged_kv_cache=config['builder_config']['paged_kv_cache'], quant_mode=QuantMode(config['builder_config']['quant_mode']), dtype=dtype, @@ -129,19 +136,25 @@ def generate(batch_size, beam_width): with open(serialize_path, 'rb') as f: engine_buffer = f.read() - decoder = ChatGLM6BHeadModelGenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) + if model_name == 'chatglm-6b': + decoder = ChatGLMGenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) + else: + decoder = GenerationSession( + model_config, + engine_buffer, + 
runtime_mapping, + ) decoder.setup(input_ids.size(0), input_ids.size(1), args.max_output_len, args.beam_width) output_ids = decoder.decode(input_ids, input_lengths, sampling_config) torch.cuda.synchronize() - data_path = _pl.Path(__file__).parent.parent / "data/chatglm6b" - if not os.path.exists(str(data_path)): - os.mkdir(data_path) + data_path = Path(__file__).parent.parent / "data" / model_name + data_path.mkdir(parents=True, exist_ok=True) nBS, nBM = input_ids.size(0), args.beam_width np.save( str(data_path) + "/inputId-BS%d-BM%d.npy" % (nBS, nBM), @@ -150,15 +163,23 @@ def generate(batch_size, beam_width): nMaxOutputLength = 0 for single_output in outputId.reshape(nBS * nBM, -1): - nMaxOutputLength = max(nMaxOutputLength, - np.min(np.where(single_output == end_id))) + if end_id in single_output: + nMaxOutputLength = max(nMaxOutputLength, + np.min(np.where(single_output == end_id))) + else: + nMaxOutputLength = len(single_output) np.save( str(data_path) + "/outputId-BS%d-BM%d.npy" % (nBS, nBM), outputId[:, :, :(nMaxOutputLength + 1)]) if __name__ == '__main__': - generate(batch_size=1, beam_width=1) - generate(batch_size=2, beam_width=1) - generate(batch_size=1, beam_width=2) - print("Finish!") + generate("chatglm-6b", batch_size=1, beam_width=1) + generate("chatglm-6b", batch_size=2, beam_width=1) + generate("chatglm2-6b", batch_size=1, beam_width=1) + generate("chatglm2-6b", batch_size=2, beam_width=1) + generate("chatglm2-6b", batch_size=1, beam_width=2) + generate("chatglm3-6b", batch_size=1, beam_width=1) + generate("chatglm3-6b", batch_size=2, beam_width=1) + generate("chatglm3-6b", batch_size=1, beam_width=2) + print("Done.") diff --git a/cpp/tests/resources/scripts/test_cpp.py b/cpp/tests/resources/scripts/test_cpp.py index be52da27c7..7e553ad21d 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -88,8 +88,7 @@ def run_tests(cuda_architectures: _tp.Optional[str] = None, model_cache: _tp.Optional[str] = None, skip_gptj=False, skip_llama=False, - skip_chatglm6b=False, - skip_chatglm2_6b=False, + skip_chatglm=False, only_fp8=False, only_multi_gpu=False, trt_root: _tp.Optional[str] = None) -> None: @@ -117,15 +116,13 @@ def run_tests(cuda_architectures: _tp.Optional[str] = None, model_cache=model_cache, skip_gptj=skip_gptj, skip_llama=skip_llama, - skip_chatglm6b=skip_chatglm6b, - skip_chatglm2_6b=skip_chatglm2_6b, + skip_chatglm=skip_chatglm, only_fp8=only_fp8) run_google_tests(build_dir=build_dir, skip_gptj=skip_gptj, skip_llama=skip_llama, - skip_chatglm6b=skip_chatglm6b, - skip_chatglm2_6b=skip_chatglm2_6b, + skip_chatglm=skip_chatglm, only_fp8=only_fp8) run_benchmarks(python_exe=python_exe, @@ -147,8 +144,7 @@ def prepare_all_model_tests(python_exe: str, model_cache: _tp.Optional[str] = None, skip_gptj=False, skip_llama=False, - skip_chatglm6b=False, - skip_chatglm2_6b=False, + skip_chatglm=False, only_fp8=False): model_cache_arg = ["--model_cache", model_cache] if model_cache else [] only_fp8_arg = ["--only_fp8"] if only_fp8 else [] @@ -178,21 +174,13 @@ def prepare_all_model_tests(python_exe: str, else: _log.info("Skipping Lllama tests") - if not skip_chatglm6b: - prepare_model_tests(model_name="chatglm6b", + if not skip_chatglm: + prepare_model_tests(model_name="chatglm", python_exe=python_exe, root_dir=root_dir, resources_dir=resources_dir) else: - _log.info("Skipping ChatGLM6B tests") - - if not skip_chatglm2_6b: - prepare_model_tests(model_name="chatglm2-6b", - python_exe=python_exe, - root_dir=root_dir, - 
resources_dir=resources_dir) - else: - _log.info("Skipping ChatGLM2-6B tests") + _log.info("Skipping ChatGLM tests") def prepare_multi_gpu_model_tests(python_exe: str, @@ -231,13 +219,17 @@ def prepare_model_tests(model_name: str, str(scripts_dir / f"generate_expected_{model_name}_output.py") ] + only_fp8_arg + only_multi_gpu_arg if only_multi_gpu_arg: - generate_expected_output = ["mpirun", "-n", "4" - ] + generate_expected_output + generate_expected_output = [ + "mpirun", + "-n", + "4", + "--allow-run-as-root", + ] + generate_expected_output run_command(generate_expected_output, cwd=root_dir, env=model_env) -def run_google_tests(build_dir: _pl.Path, skip_gptj, skip_llama, skip_chatglm6b, - skip_chatglm2_6b, only_fp8): +def run_google_tests(build_dir: _pl.Path, skip_gptj, skip_llama, skip_chatglm, + only_fp8): make_google_tests = [ "cmake", "--build", ".", "--config", "Release", "-j", "--target", "google-tests" @@ -245,16 +237,14 @@ def run_google_tests(build_dir: _pl.Path, skip_gptj, skip_llama, skip_chatglm6b, run_command(make_google_tests, cwd=build_dir) cpp_env = {**_os.environ} - ctest = ["ctest", "--output-on-failure", "--output-junit", "report.xml"] + ctest = ["ctest", "--output-on-failure", "--output-junit", "results.xml"] excluded_tests = [] if skip_gptj: excluded_tests.append(".*Gptj.*") if skip_llama: excluded_tests.append(".*Llama.*") - if skip_chatglm6b: - excluded_tests.append(".*Glm6.*") - if skip_chatglm2_6b: - excluded_tests.append(".*Glm2_6.*") + if skip_chatglm: + excluded_tests.append(".*ChatGlm.*") if only_fp8: ctest.extend(["-R", ".*FP8.*"]) else: @@ -274,7 +264,8 @@ def run_multi_gpu_tests(build_dir: _pl.Path): tests_dir = build_dir / "tests" cpp_env = {**_os.environ} session_test = [ - "mpirun", "-n", "4", "gptSessionTest", "--gtest_filter=*TP*:*PP*" + "mpirun", "-n", "4", "--allow-run-as-root", "gptSessionTest", + "--gtest_filter=*TP*:*PP*" ] run_command(session_test, cwd=tests_dir, env=cpp_env) @@ -358,12 +349,9 @@ if __name__ == "__main__": parser.add_argument("--skip_llama", action="store_true", help="Skip the tests for Llama") - parser.add_argument("--skip_chatglm6b", + parser.add_argument("--skip_chatglm", action="store_true", - help="Skip the tests for ChatGLM6B") - parser.add_argument("--skip_chatglm2_6b", - action="store_true", - help="Skip the tests for ChatGLM2-6B") + help="Skip the tests for ChatGLM") parser.add_argument( "--only_fp8", action="store_true", diff --git a/cpp/tests/runtime/gptSessionTest.cpp b/cpp/tests/runtime/gptSessionTest.cpp index ac1a679ad7..01b53a3894 100644 --- a/cpp/tests/runtime/gptSessionTest.cpp +++ b/cpp/tests/runtime/gptSessionTest.cpp @@ -148,6 +148,12 @@ public: int mTPSize; bool mRandomEndId; }; + +struct MicroBatchSizes +{ + std::optional ctxMicroBatchSize{std::nullopt}; + std::optional genMicroBatchSize{std::nullopt}; +}; } // namespace class SessionTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init) @@ -183,7 +189,7 @@ void verifyModelConfig(GptModelConfig const& modelConfig, ModelSpec const& model void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds const modelIds, SizeType beamWidth, std::initializer_list const& batchSizes, fs::path const& resultsFile, - std::shared_ptr const& logger, bool cudaGraphMode, SizeType numMicroBatches) + std::shared_ptr const& logger, bool cudaGraphMode, MicroBatchSizes microBatchSizes) { auto manager = BufferManager(std::make_shared()); @@ -275,7 +281,8 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, 
Model auto const maxBatchSize = *std::max_element(batchSizes.begin(), batchSizes.end()); GptSession::Config sessionConfig{maxBatchSize, beamWidth, maxSeqLength}; sessionConfig.decoderPerRequest = modelSpec.mDecoderPerRequest; - sessionConfig.numMicroBatches = numMicroBatches; + sessionConfig.ctxMicroBatchSize = microBatchSizes.ctxMicroBatchSize; + sessionConfig.genMicroBatchSize = microBatchSizes.genMicroBatchSize; sessionConfig.cudaGraphMode = cudaGraphMode; GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; @@ -327,6 +334,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, Model GenerationInput generationInput{ endId, padId, std::move(inputIds), std::move(inputLenghts), modelConfig.usePackedInput()}; + generationInput.maxNewTokens = maxNewTokens; // runtime will allocate memory for output if this tensor is empty GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), @@ -338,11 +346,19 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, Model { SizeType numSteps = 0; generationOutput.onTokenGenerated - = [&numSteps, &modelSpec, maxNewTokens]([[maybe_unused]] GenerationOutput::TensorPtr const& outputIds, - [[maybe_unused]] SizeType step, bool finished) + = [&numSteps, &modelSpec, maxNewTokens]( + [[maybe_unused]] GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) { + // check that we execute the callback in each step + EXPECT_EQ(step, numSteps); ++numSteps; - EXPECT_TRUE(!finished || modelSpec.mRandomEndId || numSteps == maxNewTokens); + if (!modelSpec.mRandomEndId) + { + // check that we only finish after producing `maxNewTokens` tokens + EXPECT_TRUE(!finished || numSteps == maxNewTokens); + } + // check that `finished` is set to true after producing `maxNewTokens` tokens + EXPECT_TRUE(numSteps != maxNewTokens || finished); }; session.generate(generationOutput, generationInput, samplingConfig); @@ -416,7 +432,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, Model auto constexpr kBatchSizes = {1, 8}; -using ParamType = std::tuple; +using ParamType = std::tuple; std::string generateTestName(const testing::TestParamInfo& info) { @@ -434,9 +450,11 @@ std::string generateTestName(const testing::TestParamInfo& info) name.append("DecoderBatch"); if (std::get<3>(info.param)) name.append("CudaGraph"); - auto const numMicroBatches = std::get<4>(info.param); - if (numMicroBatches > 1) - name.append("MicroBatch" + std::to_string(numMicroBatches)); + auto const microBatcheSizes = std::get<4>(info.param); + if (microBatcheSizes.ctxMicroBatchSize) + name.append("CBS" + std::to_string(microBatcheSizes.ctxMicroBatchSize.value())); + if (microBatcheSizes.genMicroBatchSize) + name.append("GBS" + std::to_string(microBatcheSizes.genMicroBatchSize.value())); if (modelSpec.mPPSize > 1) name.append("PP" + std::to_string(modelSpec.mPPSize)); if (modelSpec.mTPSize > 1) @@ -458,10 +476,8 @@ TEST_P(ParamTest, Test) auto const modelIds = modelParams.ids; auto const modelSpec = std::get<1>(GetParam()); SizeType const beamWidth{std::get<2>(GetParam())}; - auto const resultsPath - = DATA_PATH / modelDir / ((beamWidth == 1) ? 
"sampling" : "beam_search_" + std::to_string(beamWidth)); - fs::path const resultsFile{resultsPath / modelSpec.mResultsFile}; - auto const numMicroBatches = std::get<4>(GetParam()); + auto const cudaGraphMode = std::get<3>(GetParam()); + auto const microBatchSizes = std::get<4>(GetParam()); if (!modelSpec.mUseGptAttentionPlugin && beamWidth > 1) GTEST_SKIP(); @@ -485,10 +501,12 @@ TEST_P(ParamTest, Test) std::ostringstream gpuSizePath; gpuSizePath << "tp" << modelSpec.mTPSize << "-pp" << modelSpec.mPPSize << "-gpu"; auto const modelPath{ENGINGE_PATH / modelDir / modelSpec.mModelPath / gpuSizePath.str()}; - auto const cudaGraphMode = std::get<3>(GetParam()); + auto const resultsPath + = DATA_PATH / modelDir / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth)); + fs::path const resultsFile{resultsPath / modelSpec.mResultsFile}; testGptSession( - modelPath, modelSpec, modelIds, beamWidth, kBatchSizes, resultsFile, mLogger, cudaGraphMode, numMicroBatches); + modelPath, modelSpec, modelIds, beamWidth, kBatchSizes, resultsFile, mLogger, cudaGraphMode, microBatchSizes); } INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, @@ -535,7 +553,8 @@ INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, .usePagedKvCache() .useDecoderPerRequest() .useRandomEndId()), - testing::Values(1, 2), testing::Values(false, true), testing::Values(1, 3)), + testing::Values(1, 2), testing::Values(false, true), + testing::Values(MicroBatchSizes(), MicroBatchSizes{3, 3}, MicroBatchSizes{3, 6})), generateTestName); INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, @@ -568,7 +587,7 @@ INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, .useDecoderPerRequest() ), - testing::Values(1, 2), testing::Values(false), testing::Values(1)), + testing::Values(1, 2), testing::Values(false), testing::Values(MicroBatchSizes())), generateTestName); INSTANTIATE_TEST_SUITE_P(LlamaSessionTest, ParamTest, @@ -611,7 +630,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaSessionTest, ParamTest, .useTensorParallelism(2) ), - testing::Values(1, 2), testing::Values(false), testing::Values(1)), + testing::Values(1, 2), testing::Values(false), testing::Values(MicroBatchSizes())), generateTestName); class LlamaSessionOnDemandTest : public SessionTest @@ -632,7 +651,8 @@ TEST_F(LlamaSessionOnDemandTest, SamplingFP16WithAttentionPlugin) auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{2, 2}; - testGptSession(modelPath, modelSpec, modeIds, beamWidth, batchSizes, resultsFile, mLogger, false, 1); + testGptSession( + modelPath, modelSpec, modeIds, beamWidth, batchSizes, resultsFile, mLogger, false, MicroBatchSizes()); } TEST_F(LlamaSessionOnDemandTest, SamplingFP16AttentionPluginDecoderBatch) @@ -648,28 +668,34 @@ TEST_F(LlamaSessionOnDemandTest, SamplingFP16AttentionPluginDecoderBatch) auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin().usePackedInput().useDecoderPerRequest(); auto const modeIds = ModelIds{2, 2}; - testGptSession(modelPath, modelSpec, modeIds, beamWidth, batchSizes, resultsFile, mLogger, false, 1); + testGptSession( + modelPath, modelSpec, modeIds, beamWidth, batchSizes, resultsFile, mLogger, false, MicroBatchSizes()); } -class Glm6bSessionTest : public SessionTest +class ChatGlmSessionTest : public SessionTest // for ChatGLM-6B { }; -class Glm2_6bSessionTest : public SessionTest +class ChatGlm2SessionTest : public SessionTest // for ChatGLM2-6B and ChatGLM2-6B-32k { }; -// Engines need to be generated using 
cpp/tests/resources/scripts/build_gpt_engines.py. -// Expected outputs need to be generated using cpp/tests/resources/scripts/generate_expected_gpt_output.py. +class ChatGlm3SessionTest : public SessionTest // for ChatGLM3-6B and ChatGLM3-6B-32k +{ +}; + +// Engines need to be generated using cpp/tests/resources/scripts/build_chatglm_engines.py. +// Expected outputs need to be generated using cpp/tests/resources/scripts/generate_expected_chatglm_output.py. namespace { // TODO: consolidate this function with testGptSession -// Notice: both ChatGLM-6B and ChatGLM2-6B use this function, which are different at GptModelConfig::ModelVariant -void testGlm6bSession(fs::path const& modelPath, std::string const& modelName, ModelSpec const& modelSpec, +// Notice: all ChatGLM models (ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-6B-32k and ChatGLM3-6B-32k) use this +// function The differences are GptModelConfig::ModelVariant +void testChatGlmSession(fs::path const& modelPath, std::string const& modelName, ModelSpec const& modelSpec, ModelIds const modelIds, SizeType beamWidth, std::initializer_list const& batchSizes, - std::shared_ptr const& logger, bool cudaGraphMode, SizeType numMicroBatches) + std::shared_ptr const& logger, bool cudaGraphMode, MicroBatchSizes microBatchSizes) { auto manager = BufferManager(std::make_shared()); @@ -692,7 +718,7 @@ void testGlm6bSession(fs::path const& modelPath, std::string const& modelName, M auto const expectedOutputData = bufferCast(*expectedOutput); ASSERT_TRUE(fs::exists(modelPath)); - auto const json = GptJsonConfig::parse(modelPath / "config.json"); + auto const json = GptJsonConfig::parse(modelPath / (modelName + "-config.json")); auto const modelConfig = json.getModelConfig(); verifyModelConfig(modelConfig, modelSpec); auto const decoderPerRequest = modelSpec.mDecoderPerRequest; @@ -728,9 +754,9 @@ void testGlm6bSession(fs::path const& modelPath, std::string const& modelName, M auto const maxBatchSize = *std::max_element(batchSizes.begin(), batchSizes.end()); GptSession::Config sessionConfig{maxBatchSize, beamWidth, maxSeqLength}; sessionConfig.decoderPerRequest = decoderPerRequest; - sessionConfig.numMicroBatches = numMicroBatches; + sessionConfig.ctxMicroBatchSize = microBatchSizes.ctxMicroBatchSize; + sessionConfig.genMicroBatchSize = microBatchSizes.genMicroBatchSize; sessionConfig.cudaGraphMode = cudaGraphMode; - GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; EXPECT_EQ(session.getDevice(), worldConfig.getDevice()); // Use bufferManager for copying data to and from the GPU @@ -837,62 +863,74 @@ void testGlm6bSession(fs::path const& modelPath, std::string const& modelName, M } // namespace -TEST_F(Glm6bSessionTest, SamplingFP16WithGptAttentionPluginBS1BM1) +TEST_F(ChatGlmSessionTest, SamplingFP16WithGptAttentionPluginBS1BM1) { - auto const modelName{"chatglm6b"}; - auto const modelPath{ENGINGE_PATH / modelName}; + auto const modelName{"chatglm-6b"}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; auto const batchSizes = {1}; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{130005, 130005}; - testGlm6bSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, 1); + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, MicroBatchSizes()); } -TEST_F(Glm6bSessionTest, SamplingFP16WithGptAttentionPluginBS2BM1) +TEST_F(ChatGlmSessionTest, 
SamplingFP16WithGptAttentionPluginBS2BM1) { - auto const modelName{"chatglm6b"}; - auto const modelPath{ENGINGE_PATH / modelName}; + auto const modelName{"chatglm-6b"}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; auto const batchSizes = {2}; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{130005, 130005}; - testGlm6bSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, 1); + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, MicroBatchSizes()); } -TEST_F(Glm2_6bSessionTest, SamplingFP16WithGptAttentionPluginBS1BM1) +TEST_F(ChatGlm2SessionTest, SamplingFP16WithGptAttentionPluginBS1BM1) { auto const modelName{"chatglm2-6b"}; - auto const modelPath{ENGINGE_PATH / modelName}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; auto const batchSizes = {1}; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{2, 2}; - testGlm6bSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, 1); + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, MicroBatchSizes()); } -TEST_F(Glm2_6bSessionTest, SamplingFP16WithGptAttentionPluginBS2BM1) +TEST_F(ChatGlm2SessionTest, SamplingFP16WithGptAttentionPluginBS2BM1) { auto const modelName{"chatglm2-6b"}; - auto const modelPath{ENGINGE_PATH / modelName}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; auto const batchSizes = {2}; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{2, 2}; - testGlm6bSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, 1); + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, MicroBatchSizes()); } -TEST_F(Glm2_6bSessionTest, SamplingFP16WithGptAttentionPluginBS1BM2) +TEST_F(ChatGlm2SessionTest, SamplingFP16WithGptAttentionPluginBS1BM2) { auto const modelName{"chatglm2-6b"}; - auto const modelPath{ENGINGE_PATH / modelName}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; auto const batchSizes = {1}; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{2, 2}; - testGlm6bSession(modelPath, modelName, modelSpec, modeIds, 2, batchSizes, mLogger, false, 1); + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 2, batchSizes, mLogger, false, MicroBatchSizes()); +} + +TEST_F(ChatGlm3SessionTest, SamplingFP16WithGptAttentionPluginBS1BM1) +{ + auto const modelName{"chatglm3-6b"}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; + auto const batchSizes = {1}; + auto constexpr dtype = nvinfer1::DataType::kHALF; + auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); + auto const modeIds = ModelIds{2, 2}; + + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, MicroBatchSizes()); } diff --git a/cpp/tests/runtime/iTensorTest.cpp b/cpp/tests/runtime/iTensorTest.cpp new file mode 100644 index 0000000000..c2be7b5e50 --- /dev/null +++ b/cpp/tests/runtime/iTensorTest.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/iTensor.h" + +using namespace tensorrt_llm::runtime; +using namespace ::testing; + +namespace +{ + +TEST(iTensorTest, UnsqueezeShape) +{ + auto oldShape = ITensor::makeShape({2, 3, 4, 5}); + { + auto shape = ITensor::unsqueeze(oldShape, 0); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 1); + EXPECT_EQ(shape.d[1], 2); + EXPECT_EQ(shape.d[2], 3); + EXPECT_EQ(shape.d[3], 4); + EXPECT_EQ(shape.d[4], 5); + } + { + auto shape = ITensor::unsqueeze(oldShape, 1); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 2); + EXPECT_EQ(shape.d[1], 1); + EXPECT_EQ(shape.d[2], 3); + EXPECT_EQ(shape.d[3], 4); + EXPECT_EQ(shape.d[4], 5); + } + + { + auto shape = ITensor::unsqueeze(oldShape, 4); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 2); + EXPECT_EQ(shape.d[1], 3); + EXPECT_EQ(shape.d[2], 4); + EXPECT_EQ(shape.d[3], 5); + EXPECT_EQ(shape.d[4], 1); + } + + std::vector invalidDims{-1, 5, 10}; + for (auto invalidDim : invalidDims) + { + try + { + auto shape = ITensor::unsqueeze(oldShape, invalidDim); + FAIL() << "Expected failure"; + } + catch (tensorrt_llm::common::TllmException const& e) + { + EXPECT_THAT(e.what(), testing::HasSubstr("Invalid dim")); + } + catch (...) + { + FAIL() << "Expected TllmException"; + } + } +} + +TEST(iTensorTest, UnsqueezeTensor) +{ + auto oldShape = ITensor::makeShape({2, 3, 4, 5}); + BufferManager manager(std::make_shared()); + + { + auto tensor = manager.cpu(oldShape, nvinfer1::DataType::kINT32); + tensor->unsqueeze(0); + auto shape = tensor->getShape(); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 1); + EXPECT_EQ(shape.d[1], 2); + EXPECT_EQ(shape.d[2], 3); + EXPECT_EQ(shape.d[3], 4); + EXPECT_EQ(shape.d[4], 5); + } + { + auto tensor = manager.cpu(oldShape, nvinfer1::DataType::kINT32); + tensor->unsqueeze(1); + auto shape = tensor->getShape(); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 2); + EXPECT_EQ(shape.d[1], 1); + EXPECT_EQ(shape.d[2], 3); + EXPECT_EQ(shape.d[3], 4); + EXPECT_EQ(shape.d[4], 5); + } + + { + auto tensor = manager.cpu(oldShape, nvinfer1::DataType::kINT32); + tensor->unsqueeze(4); + auto shape = tensor->getShape(); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 2); + EXPECT_EQ(shape.d[1], 3); + EXPECT_EQ(shape.d[2], 4); + EXPECT_EQ(shape.d[3], 5); + EXPECT_EQ(shape.d[4], 1); + } + + std::vector invalidDims{-1, 5, 10}; + for (auto invalidDim : invalidDims) + { + try + { + auto tensor = manager.cpu(oldShape, nvinfer1::DataType::kINT32); + tensor->unsqueeze(invalidDim); + FAIL() << "Expected failure"; + } + catch (tensorrt_llm::common::TllmException const& e) + { + EXPECT_THAT(e.what(), testing::HasSubstr("Invalid dim")); + } + catch (...) 
+ { + FAIL() << "Expected TllmException"; + } + } +} + +} // namespace diff --git a/docs/source/batch_manager.md b/docs/source/batch_manager.md index 7a54f18d25..aac340aef3 100644 --- a/docs/source/batch_manager.md +++ b/docs/source/batch_manager.md @@ -94,17 +94,29 @@ The statistics are packaged as a JSON string. That string contains the following * `Active Request Count`, the number of active requests in batch manager * `Max Request Count`, the max number of requests batch manager can support at a time -When using in-flight batching, the following additional statistics are reported: +When using paged KV cache, following statistics are reported: * `Max KV cache blocks`, the maximum number of KV cache blocks per GPU * `Free KV cache blocks`, number of free KV cache blocks per GPU * `Used KV cache blocks`, number of used KV cache blocks per GPU * `Tokens per KV cache block`, number of tokens per KV cache block * `Scheduled Requests`, number of requests scheduled this iteration + +When using in-flight batching, the following additional statistics are reported per step/iteration: + + * `Scheduled Requests`, number of total requests scheduled * `Context Requests`, number of requests in Context phase - * `Total Context Tokens`, total number of tokens across requests in context phase - * `Generation Requests`, number of requests in Context phase * `Generation Requests`, number of requests in Generation phase - * `MicroBatch ID`, number of requests in Generation phase + * `Total Context Tokens`, total number of tokens across requests in context phase + * `MicroBatch ID`, micro batch ID + +When using V1 batching, the following additional statistics are reported per V1 iteration: + + * `Scheduled Requests`, number of total requests scheduled + * `Context Requests`, number of requests in Context phase + * `Total Generation Tokens`, Total number of tokens generated + * `Total Context Tokens`, total number of tokens across requests in context phase + * `Empty Generation Slots`, total number of padded Slots during generation phase + ### GptManager Design diff --git a/docs/source/gpt_runtime.md b/docs/source/gpt_runtime.md index 997249c3fa..035ed77aec 100644 --- a/docs/source/gpt_runtime.md +++ b/docs/source/gpt_runtime.md @@ -266,7 +266,7 @@ second one contains `[9, 2]` and the third one is composed of tokens `[6, 2, 4, 1]`. In total, there are 9 tokens. That's the length. The shape of the tensor is `[2, 9]`. The first row of the tensor must contain the 9 token IDs and the second row must store the -[exclusive prefix-sum](https://en.wikipedia.org/wiki/Prefix_sum) +[inclusive prefix-sum](https://en.wikipedia.org/wiki/Prefix_sum) of the word lengths as shown on the following diagram: ``` @@ -274,7 +274,7 @@ of the word lengths as shown on the following diagram: | | | | V V V V [ 5, 7, 3, 9, 2, 6, 2, 4, 1] -[ 0, 3, 5, 9, -1, -1, -1, -1, -1] +[ 3, 5, 9, -1, -1, -1, -1, -1, -1] ``` In case all the words are made of a single token, the inner-most dimension of diff --git a/docs/source/precision.md b/docs/source/precision.md index 5877e78076..86d193fe10 100644 --- a/docs/source/precision.md +++ b/docs/source/precision.md @@ -114,23 +114,26 @@ GPT-J and LLaMA. Those examples can be found in This release of TensorRT-LLM contains the following examples: -| Model | FP32 | FP16 | BF16 | FP8 | W8A8 SQ | W8A16 | W4A16 | W4A16 AWQ | W4A16 GPTQ | -| :-------------------------- | :--: | :--: | :--: | :--: | :-----: | :---: | :---: | :-------: | :--------: | -| Baichuan | Y | Y | Y | . | . | Y | Y | . | . 
| -| BERT | Y | Y | Y | . | . | . | . | . | . | -| BLOOM | Y | Y | Y | . | Y | Y | Y | . | . | -| ChatGLM | Y | Y | Y | . | . | . | . | . | . | -| ChatGLM-v2 | Y | Y | Y | . | . | . | . | . | . | -| Falcon | Y | Y | Y | . | . | . | . | . | . | -| GPT | Y | Y | Y | Y | Y | Y | Y | . | . | -| GPT-J | Y | Y | Y | Y | Y | Y | Y | Y | . | -| GPT-NeMo | Y | Y | Y | . | . | . | . | . | . | -| GPT-NeoX | Y | Y | Y | . | . | . | . | . | Y | -| LLaMA | Y | Y | Y | . | Y | Y | Y | Y | Y | -| LLaMA-v2 | Y | Y | Y | Y | Y | Y | Y | Y | Y | -| OPT | Y | Y | Y | . | . | . | . | . | . | -| SantaCoder | Y | Y | Y | . | . | . | . | . | . | -| StarCoder | Y | Y | Y | . | . | . | . | . | . | +| Model | FP32 | FP16 | BF16 | FP8 | W8A8 SQ | W8A16 | W4A16 | W4A16 AWQ | W4A16 GPTQ | +| :--------- | :---: | :---: | :---: | :---: | :-----: | :---: | :---: | :-------: | :--------: | +| Baichuan | Y | Y | Y | . | Y | Y | Y | . | . | +| BERT | Y | Y | Y | . | . | . | . | . | . | +| BLOOM | Y | Y | Y | . | Y | Y | Y | . | . | +| ChatGLM | Y | Y | Y | . | . | . | . | . | . | +| ChatGLM-v2 | Y | Y | Y | . | . | . | . | . | . | +| ChatGLM-v3 | Y | Y | Y | . | . | . | . | . | . | +| Falcon | Y | Y | Y | . | . | . | . | . | . | +| GPT | Y | Y | Y | Y | Y | Y | Y | . | . | +| GPT-J | Y | Y | Y | Y | Y | Y | Y | Y | . | +| GPT-NeMo | Y | Y | Y | . | . | . | . | . | . | +| GPT-NeoX | Y | Y | Y | . | . | . | . | . | Y | +| LLaMA | Y | Y | Y | . | Y | Y | Y | Y | Y | +| LLaMA-v2 | Y | Y | Y | Y | Y | Y | Y | Y | Y | +| OPT | Y | Y | Y | . | . | . | . | . | . | +| SantaCoder | Y | Y | Y | . | . | . | . | . | . | +| StarCoder | Y | Y | Y | . | . | . | . | . | . | +| InternLM | Y | Y | Y | . | Y | Y | Y | . | . | + ## Technical Detail: The `QuantMode` Flags diff --git a/examples/baichuan/README.md b/examples/baichuan/README.md index 82e0b87692..241bb540a3 100644 --- a/examples/baichuan/README.md +++ b/examples/baichuan/README.md @@ -16,6 +16,8 @@ These scripts accept an argument named model_version, whose value should be `v1_ * FP16 * BF16 * INT4 & INT8 Weight-Only + * INT8 KV CACHE + * INT8 Smooth Quant ## Usage @@ -82,6 +84,74 @@ python build.py --model_version v1_13b \ --world_size 2 ``` +#### INT8 weight only + INT8 KV cache +For INT8 KV cache, [`hf_baichuan_convert.py`](./hf_baichuan_convert.py) features a +`--calibrate-kv-cache, -kv` option. Setting `-kv` will calibrate the model, +and then export the scaling factors needed for INT8 KV cache inference. + + +Example: + +```bash +python3 hf_baichuan_convert.py -i baichuan-inc/Baichuan-13B-Chat -o ./tmp/baichuan_v1_13b/int8_kv_cache/ --calibrate-kv-cache -t fp16 +``` + +[`build.py`](./build.py) add new options for the support of INT8 KV cache. + +`--int8_kv_cache` is the command-line option to enable INT8 KV cache. + +In addition, it could be combined with INT8 weight-only quantization, as follows: + +Examples of INT8 weight-only quantization + INT8 KV cache + +```bash +# Build model with both INT8 weight-only and INT8 KV cache enabled +python build.py --model_version v1_13b \ + --bin_model_dir=./tmp/baichuan_v1_13b/int8_kv_cache/1-gpu/ \ + --dtype float16 \ + --use_gpt_attention_plugin float16 \ + --use_gemm_plugin float16 \ + --output_dir ./tmp/baichuan_v1_13b/trt_engines/int8_kv_cache_weight_only/1-gpu \ + --int8_kv_cache \ + --use_weight_only +``` + +#### SmoothQuant + +The SmoothQuant supports all Baichuan model variants. 
Unlike the FP16 build where the HF weights are processed and loaded into the TensorRT-LLM directly, the SmoothQuant needs to load INT8 weights which should be pre-processed before building an engine. + +Example: +```bash +python3 hf_baichuan_convert.py -i baichuan-inc/Baichuan-13B-Chat -o ./tmp/baichuan_v1_13b/sq0.8/ -sq 0.8 --tensor-parallelism 1 --storage-type fp16 +``` + +[`build.py`](./build.py) add new options for the support of INT8 inference of SmoothQuant models. + +`--use_smooth_quant` is the starting point of INT8 inference. By default, it +will run the model in the _per-tensor_ mode. + +Then, you can add any combination of `--per-token` and `--per-channel` to get the corresponding behaviors. + +Examples of build invocations: + +```bash +# Build model for SmoothQuant in the _per_tensor_ mode. +python3 build.py --model_version v1_13b \ + --bin_model_dir=./tmp/baichuan_v1_13b/sq0.8/1-gpu/ \ + --use_smooth_quant \ + --use_gpt_attention_plugin float16 \ + +# Build model for SmoothQuant in the _per_token_ + _per_channel_ mode +python3 build.py --model_version v1_13b \ + --bin_model_dir=./tmp/baichuan_v1_13b/sq0.8/1-gpu/ \ + --use_smooth_quant \ + --use_gpt_attention_plugin float16 \ + --per_token \ + --per_channel +``` + +Note we use `--bin_model_dir` instead of `--model_dir` and `--meta_ckpt_dir` since SmoothQuant model needs INT8 weights and various scales from the binary files. + ### Run To run a TensorRT-LLM Baichuan model using the engines generated by build.py diff --git a/examples/baichuan/build.py b/examples/baichuan/build.py index 78e9eca2de..45c7f483fe 100644 --- a/examples/baichuan/build.py +++ b/examples/baichuan/build.py @@ -15,6 +15,7 @@ import argparse import os import time +from pathlib import Path import onnx import tensorrt as trt @@ -29,12 +30,12 @@ from tensorrt_llm.builder import Builder from tensorrt_llm.layers.attention import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import BaichuanForCausalLM, weight_only_quantize +from tensorrt_llm.models import BaichuanForCausalLM, quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode -from weight import load_from_hf_baichuan # isort:skip +from weight import load_from_hf_baichuan, load_from_binary, parse_bin_config # isort:skip # 2 routines: get_engine_name, serialize_engine # are direct copy from gpt example, TODO: put in utils? @@ -115,9 +116,8 @@ def parse_arguments(): type=int, default=1, help='world size, only support tensor parallelism now') - parser.add_argument('--model_dir', - type=str, - default='baichuan-inc/Baichuan-13B-Chat') + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--bin_model_dir', type=str, default=None) parser.add_argument('--model_version', type=str, default='v1_13b', @@ -180,6 +180,38 @@ def parse_arguments(): default=False, action='store_true') + # Arguments related to the quantization of the model. + parser.add_argument( + '--use_smooth_quant', + default=False, + action="store_true", + help= + 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' + 'See --per_channel and --per_token for finer-grained quantization options.' + ) + parser.add_argument( + '--per_channel', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor for the GEMM\'s result. 
' + 'per_channel instead uses a different static scaling factor for each channel. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_token', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale activations in the int8 range. ' + 'per_token chooses at run time, and for each token, a custom scaling factor. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--int8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' + ) parser.add_argument( '--use_weight_only', default=False, @@ -222,11 +254,15 @@ def parse_arguments(): args = parser.parse_args() - if args.use_weight_only: - args.quant_mode = QuantMode.use_weight_only( - args.weight_only_precision == 'int4') - else: - args.quant_mode = QuantMode(0) + assert not ( + args.use_smooth_quant and args.use_weight_only + ), "You cannot enable both SmoothQuant and INT8 weight-only together." + + if not args.remove_input_padding: + if args.use_gpt_attention_plugin: + logger.warning( + f"It is recommended to specify --remove_input_padding when using GPT attention plugin" + ) if args.use_inflight_batching: if not args.use_gpt_attention_plugin: @@ -245,6 +281,18 @@ def parse_arguments(): if args.max_num_tokens is not None: assert args.enable_context_fmha + if args.use_smooth_quant: + args.quant_mode = QuantMode.use_smooth_quant(args.per_token, + args.per_channel) + elif args.use_weight_only: + args.quant_mode = QuantMode.use_weight_only( + args.weight_only_precision == 'int4') + else: + args.quant_mode = QuantMode(0) + + if args.int8_kv_cache: + args.quant_mode = args.quant_mode.set_int8_kv_cache() + if args.model_dir is not None: hf_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True) @@ -259,6 +307,16 @@ def parse_arguments(): args.n_positions = hf_config.model_max_length args.vocab_size = hf_config.vocab_size args.hidden_act = hf_config.hidden_act + elif args.bin_model_dir is not None: + n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, _ = parse_bin_config( + Path(args.bin_model_dir) / "config.ini") + args.inter_size = inter_size + args.n_embd = n_embd + args.n_head = n_head + args.n_layer = n_layer + args.n_positions = n_positions + args.vocab_size = vocab_size + args.hidden_act = hidden_act else: # default values are based on v1_13b, change them based on model_version if args.model_version == 'v1_7b': @@ -286,9 +344,6 @@ def parse_arguments(): args.vocab_size = 125696 args.hidden_act = 'silu' - if args.dtype == 'bfloat16': - assert args.use_gemm_plugin, "Please use gemm plugin when dtype is bfloat16" - return args @@ -301,7 +356,10 @@ def build_rank_engine(builder: Builder, @param args: The cmd line arguments. @return: The built engine. 
''' - kv_dtype = str_dtype_to_trt(args.dtype) + dtype = str_dtype_to_trt(args.dtype) + mapping = Mapping(world_size=args.world_size, + rank=rank, + tp_size=args.world_size) if args.model_version == 'v1_7b' or args.model_version == 'v2_7b': position_embedding_type = PositionEmbeddingType.rope_gpt_neox else: @@ -311,23 +369,19 @@ def build_rank_engine(builder: Builder, tensorrt_llm_baichuan = BaichuanForCausalLM( num_layers=args.n_layer, num_heads=args.n_head, + num_kv_heads=None, hidden_size=args.n_embd, vocab_size=args.vocab_size, hidden_act=args.hidden_act, max_position_embeddings=args.n_positions, position_embedding_type=position_embedding_type, - dtype=kv_dtype, + dtype=dtype, mlp_hidden_size=args.inter_size, - mapping=Mapping(world_size=args.world_size, - rank=rank, - tp_size=args.world_size)) - if args.use_weight_only and args.weight_only_precision == 'int8': - tensorrt_llm_baichuan = weight_only_quantize( - tensorrt_llm_baichuan, QuantMode.use_weight_only()) - elif args.use_weight_only and args.weight_only_precision == 'int4': - tensorrt_llm_baichuan = weight_only_quantize( - tensorrt_llm_baichuan, - QuantMode.use_weight_only(use_int4_weights=True)) + mapping=mapping, + quant_mode=args.quant_mode) + if args.use_smooth_quant or args.use_weight_only: + tensorrt_llm_baichuan = quantize_model(tensorrt_llm_baichuan, + args.quant_mode) if args.model_dir is not None: logger.info( f'Loading HF Baichuan {args.model_version} ... from {args.model_dir}' @@ -351,6 +405,12 @@ def build_rank_engine(builder: Builder, args.world_size, dtype=args.dtype) del hf_baichuan + elif args.bin_model_dir is not None: + load_from_binary(tensorrt_llm_baichuan, + args.bin_model_dir, + mapping, + fp16=(args.dtype == 'float16'), + multi_query_mode=False) # Module -> Network network = builder.create_network() @@ -360,6 +420,12 @@ def build_rank_engine(builder: Builder, dtype=args.use_gpt_attention_plugin) if args.use_gemm_plugin: network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) + # Quantization plugins. 
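+    # SmoothQuant runs these GEMMs in INT8, so the matching plugins are enabled
+    # here: the SmoothQuant GEMM plugin, the RMSNorm quantization plugin, and
+    # the quantize-tensor / quantize-per-token plugins that produce the INT8
+    # activations those GEMMs consume.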
+ if args.use_smooth_quant: + network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) + network.plugin_config.set_rmsnorm_quantization_plugin(dtype=args.dtype) + network.plugin_config.set_quantize_tensor_plugin() + network.plugin_config.set_quantize_per_token_plugin() assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) if args.enable_context_fmha: network.plugin_config.set_context_fmha(ContextFMHAType.enabled) @@ -393,7 +459,7 @@ def build_rank_engine(builder: Builder, v = v.trt_tensor v.name = k network.trt_network.mark_output(v) - v.dtype = kv_dtype + v.dtype = dtype if args.visualize: model_path = os.path.join(args.output_dir, 'test.onnx') to_onnx(network.trt_network, model_path) @@ -407,6 +473,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_baichuan) + return engine @@ -425,6 +494,9 @@ def build(rank, args): # skip other ranks if parallel_build is enabled if args.parallel_build and cur_rank != rank: continue + # NOTE(nkorobov): when only int8 kv cache is used together with paged kv cache no int8 tensors are exposed to TRT + int8_trt_flag = args.quant_mode.has_act_or_weight_quant() or ( + not args.paged_kv_cache and args.quant_mode.has_int8_kv_cache()) builder_config = builder.create_builder_config( name=model_name, precision=args.dtype, @@ -441,7 +513,8 @@ def build(rank, args): max_input_len=args.max_input_len, max_output_len=args.max_output_len, max_num_tokens=args.max_num_tokens, - int8=args.quant_mode.has_act_or_weight_quant()) + int8=int8_trt_flag, + quant_mode=args.quant_mode) engine_name = get_engine_name(model_name, args.dtype, args.world_size, cur_rank) engine = build_rank_engine(builder, builder_config, engine_name, @@ -454,6 +527,7 @@ def build(rank, args): cache = builder_config.trt_builder_config.get_timing_cache() serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine if rank == 0: ok = builder.save_timing_cache( diff --git a/examples/baichuan/convert.py b/examples/baichuan/convert.py new file mode 100644 index 0000000000..11dd73d3ca --- /dev/null +++ b/examples/baichuan/convert.py @@ -0,0 +1,295 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Utilities for exporting a model to our custom format. 
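+
+    The exported checkpoint is a set of per-rank binary weight files plus the
+    per-tensor / per-column INT8 scaling factors used for SmoothQuant and INT8
+    KV cache inference; weight.py's load_from_binary reads them back when an
+    engine is built with --bin_model_dir.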
+""" +import numpy as np +import torch + + +def save_val(val, dir, key, tp_num=None): + suffix = "bin" if tp_num is None else f"{tp_num}.bin" + val.tofile(dir / f"model.{key}.{suffix}") + + +def save_split(split_vals, dir, key, i, factor): + for j, val in enumerate(split_vals): + save_val(val, dir, key, i * factor + j) + + +def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False): + """ + This function has two purposes: + - compute quantized weights, scaled either per-tensor or per-column + - compute scaling factors + + Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ. + CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W. + CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor. + + Here is the list of what we need (T means per-tensor, C per-column): + - scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8). Used before the GEMM. (T) + - scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T) + - scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C) + - scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32) + to quant range (int8) (used for CUBLAS) (T, C) + + Note that we don't do anything special about row-parallel GEMM. Theoretically, we could have per-GPU scaling factors too, + but then the model would change depending on the number of GPUs used. + + For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection, we consider it + as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V. + For our GEMM implementation to respect this behavior, we use per-column mode and replicate values along columns. + """ + + # compute weight scaling factors for fp->int8 and int8->fp + if is_qkv and not multi_query_mode: + scale_w_orig_quant_t = 127. / act_range["w"].reshape(3, -1).max( + dim=-1, keepdims=True)[0].cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].reshape(3, + -1).cpu().numpy() + elif is_qkv and multi_query_mode: + hidden_dim = weights.shape[0] + local_dim = act_range["w"].shape[0] + kv_dim = (local_dim - hidden_dim) // 2 + scale_w_q = act_range["w"][0:hidden_dim] + scale_w_k = act_range["w"][hidden_dim:hidden_dim + kv_dim] + scale_w_v = act_range["w"][-kv_dim:] + + scale_w_qkv_t = torch.concat([ + scale_w_q.max(dim=0, keepdim=True)[0], + scale_w_k.max(dim=0, keepdim=True)[0], + scale_w_v.max(dim=0, keepdim=True)[0] + ]) + + scale_w_orig_quant_t = 127. / scale_w_qkv_t.cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy() + else: + scale_w_orig_quant_t = 127. / act_range["w"].max().cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy() + scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t + scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c + + # compute the rest of needed scaling factors + scale_x_orig_quant_t = np.array(127. / act_range["x"].max().item()) + scale_y_orig_quant_t = np.array(127. / act_range["y"].max().item()) + scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.) 
+ scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_t) + scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_c) + if is_qkv and not multi_query_mode: + scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t, + scale_w_orig_quant_c.shape) + scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t, + scale_w_orig_quant_c.shape) + if is_qkv and multi_query_mode: + scale_q_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[0], + scale_w_q.shape) + scale_k_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[1], + scale_w_k.shape) + scale_v_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[2], + scale_w_v.shape) + scale_y_accum_quant_t = np.concatenate( + [scale_q_y_accum_t, scale_k_y_accum_t, scale_v_y_accum_t]) + scale_w_quant_orig_t = np.concatenate([ + np.broadcast_to(scale_w_quant_orig_t[0], scale_w_q.shape), + np.broadcast_to(scale_w_quant_orig_t[1], scale_w_k.shape), + np.broadcast_to(scale_w_quant_orig_t[2], scale_w_v.shape) + ]) + + to_i8 = lambda x: x.round().clip(-127, 127).astype(np.int8) + + if is_qkv and multi_query_mode: + scale_w_quant_orig_t_expand = np.ones([weights.shape[-1]]) + scale_w_quant_orig_t_expand[:hidden_dim] = scale_w_quant_orig_t[0] + scale_w_quant_orig_t_expand[hidden_dim:hidden_dim + + kv_dim] = scale_w_quant_orig_t[1] + scale_w_quant_orig_t_expand[-kv_dim:] = scale_w_quant_orig_t[2] + weight_int8 = to_i8(weights * scale_w_quant_orig_t_expand) + else: + weight_int8 = to_i8(weights * scale_w_orig_quant_t) + return { + "weight.int8": weight_int8, + "weight.int8.col": to_i8(weights * scale_w_orig_quant_c), + "scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32), + "scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32), + "scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32), + "scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32), + "scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32), + "scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32), + } + + +def save_multi_query_mode_qkv_int8(val, dir, base_key, saved_key, factor, rank, + local_dim, head_size): + q, k, v = np.split(val, [local_dim, local_dim + head_size], axis=-1) + q_split = np.split(q, factor, axis=-1) + k_split = np.split(k, factor, axis=-1) + v_split = np.split(v, factor, axis=-1) + split_vals = [ + np.concatenate((q_split[ii], k_split[ii], v_split[ii]), axis=-1) + for ii in range(factor) + ] + save_split(split_vals, dir, f"{base_key}.{saved_key}", rank, factor) + + +def write_int8(vals, + dir, + base_key, + split_dim, + i, + factor, + is_qkv=False, + multi_query_mode=False): + saved_keys_once = [ + "scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant", + "scale_y_quant_orig" + ] + + if is_qkv and multi_query_mode: + assert split_dim == -1 + local_dim = vals["weight.int8"].shape[0] + head_size = (vals["weight.int8"].shape[1] - local_dim) // 2 + + save_multi_query_mode_qkv_int8(vals["weight.int8"], dir, base_key, + "weight.int8", factor, i, local_dim, + head_size) + save_multi_query_mode_qkv_int8(vals["weight.int8.col"], dir, base_key, + "weight.int8.col", factor, i, local_dim, + head_size) + save_multi_query_mode_qkv_int8(vals["scale_w_quant_orig.col"], dir, + base_key, "scale_w_quant_orig.col", + factor, i, local_dim, head_size) + save_multi_query_mode_qkv_int8(vals["scale_y_accum_quant.col"], dir, + base_key, "scale_y_accum_quant.col", + factor, i, local_dim, head_size) + else: + save_split(np.split(vals["weight.int8"], factor, 
axis=split_dim), dir, + f"{base_key}.weight.int8", i, factor) + save_split(np.split(vals["weight.int8.col"], factor, axis=split_dim), + dir, f"{base_key}.weight.int8.col", i, factor) + + if split_dim == -1: + save_split( + np.split(vals["scale_w_quant_orig.col"], factor, + axis=split_dim), dir, + f"{base_key}.scale_w_quant_orig.col", i, factor) + save_split( + np.split(vals["scale_y_accum_quant.col"], + factor, + axis=split_dim), dir, + f"{base_key}.scale_y_accum_quant.col", i, factor) + else: + saved_keys_once += [ + "scale_w_quant_orig.col", "scale_y_accum_quant.col" + ] + + if i == 0: + for save_key in saved_keys_once: + save_val(vals[save_key], dir, f"{base_key}.{save_key}") + + +def str_to_np_dtype(type_str): + convert_dict = { + "fp32": np.float32, + "fp16": np.float16, + } + dtype = convert_dict.get(type_str) + if dtype is None: + raise ValueError(f"{type_str} is an invalid storage type") + return dtype + + +def split_and_save_weight(i, saved_dir, factor, key, val, act_range, config): + # The split_factor indicates the number of ranks to implement + # distributed GEMMs. For Tensor Parallelism, each rank/GPU works + # on split_hidden_dim // split_factor channels. + + int8_outputs = config.get("int8_outputs", None) + multi_query_mode = config.get("multi_query_mode", False) + local_dim = config.get("local_dim", None) + + save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only" + + if "input_layernorm.weight" in key or "input_layernorm.bias" in key or \ + "attention.dense.bias" in key or "post_layernorm.weight" in key or \ + "post_attention_layernorm.bias" in key or "mlp.dense_4h_to_h.bias" in key or \ + "final_layernorm.weight" in key or "final_layernorm.bias" in key: + + # shared weights, only need to convert the weights of rank 0 + if i == 0: + save_val(val, saved_dir, key) + + elif "attention.dense.weight" in key or "mlp.proj.weight" in key: + split_dim = 0 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, act_range) + write_int8(vals_i8, saved_dir, base_key, split_dim, i, factor) + + elif "mlp.fc.weight" in key or "mlp.gate.weight" in key: + split_dim = -1 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, act_range) + write_int8(vals_i8, saved_dir, base_key, split_dim, i, factor) + + elif "attention.query_key_value.weight" in key: + hidden_dim = val.shape[0] + if local_dim is None: + local_dim = val.shape[-1] // 3 + if multi_query_mode: + head_size = (val.shape[-1] - local_dim) // 2 + val = val.reshape(hidden_dim, local_dim + 2 * head_size) + w_q, w_k, w_v = np.split(val, [local_dim, local_dim + head_size], + axis=-1) + w_q_split = np.split(w_q, factor, axis=-1) + w_k_split = np.split(w_k, factor, axis=-1) + w_v_split = np.split(w_v, factor, axis=-1) + split_vals = [ + np.concatenate((w_q_split[ii], w_k_split[ii], w_v_split[ii]), + axis=-1) for ii in range(factor) + ] + split_dim = -1 + else: + val = val.reshape(hidden_dim, 3, local_dim) + split_dim = -1 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if save_int8: + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, + act_range, + is_qkv=True, + multi_query_mode=multi_query_mode) + 
write_int8(vals_i8, + saved_dir, + base_key, + split_dim, + i, + factor, + is_qkv=True, + multi_query_mode=multi_query_mode) + elif "attention.dense.smoother" in key or "mlp.proj.smoother" in key: + split_vals = np.split(val, factor, axis=0) + save_split(split_vals, saved_dir, key, i, factor) + + else: + print(f"[WARNING] {key} not handled by converter") diff --git a/examples/baichuan/hf_baichuan_convert.py b/examples/baichuan/hf_baichuan_convert.py new file mode 100644 index 0000000000..73c53071f6 --- /dev/null +++ b/examples/baichuan/hf_baichuan_convert.py @@ -0,0 +1,291 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Convert Baichuan models. Use https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat as demo. +''' +import argparse +import configparser +import os +from pathlib import Path + +import torch +import torch.multiprocessing as multiprocessing +from convert import split_and_save_weight, str_to_np_dtype +from smoothquant import (capture_activation_range, smooth_gemm, + smooth_gemm_fc1_gate) +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer + + +@torch.no_grad() +def smooth_baichuan_model(model, scales, alpha, baichuan_smoother): + # Smooth the activation and weights with smoother = $\diag{s}$ + for name, module in model.named_modules(): + class_name = module.__class__.__name__ + if not 'Layer' in class_name: + continue + print(f'smoothing module: {name}, class_name: {class_name}') + # qkv_proj + layer_name_qkv = name + ".self_attn.W_pack" + + smoother = smooth_gemm(module.self_attn.W_pack.weight, + scales[layer_name_qkv]["x"], + module.input_layernorm.weight, None, alpha) + + scales[layer_name_qkv]["x"] = scales[layer_name_qkv]["x"] / smoother + scales[layer_name_qkv]["w"] = module.self_attn.W_pack.weight.abs().max( + dim=1)[0] + + # ================================================================= + layer_name = name + ".self_attn.o_proj" + smoother = smooth_gemm(module.self_attn.o_proj.weight, + scales[layer_name]["x"], None, None, alpha) + baichuan_smoother[layer_name] = smoother.float() + + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.self_attn.o_proj.weight.abs().max( + dim=1)[0] + + # ================================================================== + fc1_layer_name = name + ".mlp.gate_proj" + gate_layer_name = name + ".mlp.up_proj" + + smoother = smooth_gemm_fc1_gate(module.mlp.gate_proj.weight, + module.mlp.up_proj.weight, + scales[fc1_layer_name]["x"], + module.post_attention_layernorm.weight, + None, alpha) + + scales[fc1_layer_name]["x"] = scales[fc1_layer_name]["x"] / smoother + scales[fc1_layer_name]["w"] = module.mlp.gate_proj.weight.abs().max( + dim=1)[0] + + scales[gate_layer_name]["x"] = scales[gate_layer_name]["x"] / smoother + scales[gate_layer_name]["w"] = module.mlp.up_proj.weight.abs().max( + dim=1)[0] + + # 
================================================================== + layer_name = name + ".mlp.down_proj" + smoother = smooth_gemm(module.mlp.down_proj.weight, + scales[layer_name]["x"], None, None, alpha) + baichuan_smoother[layer_name] = smoother.float() + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.mlp.down_proj.weight.abs().max( + dim=1)[0] + + +def baichuan_to_bin_name(orig_name): + global_bin_weights = { + "model.embed_tokens.weight": 'vocab_embedding.weight', + "model.norm.weight": 'ln_f.weight', + "lm_head.weight": 'lm_head.weight', + } + + if orig_name in global_bin_weights: + return global_bin_weights[orig_name] + + _, _, layer_id, *weight_name = orig_name.split(".") + + layer_id = int(layer_id) + weight_name = ".".join(weight_name) + + per_layer_weights = { + "input_layernorm.weight": "input_layernorm.weight", + "self_attn.W_pack.weight": "attention.query_key_value.weight", + "self_attn.o_proj.weight": "attention.dense.weight", + "mlp.gate_proj.weight": "mlp.fc.weight", + "mlp.down_proj.weight": "mlp.proj.weight", + "mlp.up_proj.weight": "mlp.gate.weight", + "post_attention_layernorm.weight": "post_layernorm.weight", + } + + return f"layers.{layer_id}.{per_layer_weights[weight_name]}" + + +# Baichuan uses nn.Linear for these following ops whose weight matrix is transposed compared to gpt2. +# In order to use the preprocess codes of gpt2, we transpose them firstly. +def transpose_weights(hf_name, param): + weight_to_transpose = [ + "W_pack", "o_proj", "gate_proj", "down_proj", "up_proj" + ] + if any([k in hf_name for k in weight_to_transpose]): + if len(param.shape) == 2: + param = param.transpose(0, 1) + return param + + +def hf_baichuan_converter(args): + infer_tp = args.tensor_parallelism + saved_dir = Path(args.out_dir) / f"{infer_tp}-gpu" + saved_dir.mkdir(parents=True, exist_ok=True) + + model = AutoModelForCausalLM.from_pretrained(args.in_file, + device_map="auto", + trust_remote_code=True) + + act_range = {} + # smoother for inputs of self_attn.o_proj and mlp.down_proj + baichuan_smoother = {} + + if args.smoothquant is not None or args.calibrate_kv_cache: + os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get( + "TOKENIZERS_PARALLELISM", "false") + act_range = capture_activation_range( + model, + AutoTokenizer.from_pretrained(args.in_file, + use_fast=False, + trust_remote_code=True)) + if args.smoothquant is not None: + smooth_baichuan_model(model, act_range, args.smoothquant, + baichuan_smoother) + + config = configparser.ConfigParser() + config["baichuan"] = {} + for key in vars(args): + config["baichuan"][key] = f"{vars(args)[key]}" + for k, v in vars(model.config).items(): + config["baichuan"][k] = f"{v}" + config["baichuan"]["weight_data_type"] = args.storage_type + config["baichuan"]["multi_query_mode"] = str(False) + with open(saved_dir / "config.ini", 'w') as configfile: + config.write(configfile) + + storage_type = str_to_np_dtype(args.storage_type) + + global_bin_weights = [ + 'vocab_embedding.weight', 'ln_f.weight', 'lm_head.weight' + ] + + int8_outputs = None + if args.calibrate_kv_cache: + int8_outputs = "kv_cache_only" + if args.smoothquant is not None: + int8_outputs = "all" + + starmap_args = [] + for name, param in model.named_parameters(): + if "weight" not in name and "bias" not in name: + continue + bin_name = baichuan_to_bin_name(name) + + if name.replace(".weight", "") in baichuan_smoother.keys(): + smoother = baichuan_smoother[name.replace(".weight", "")] + smoother = 
smoother.detach().cpu().numpy() + starmap_args.append( + (0, saved_dir, infer_tp, + f"{bin_name}.smoother".replace(".weight", + ""), smoother, None, { + "int8_outputs": + int8_outputs, + "multi_query_mode": False, + "local_dim": None, + })) + + param = transpose_weights(name, param) + + param = param.detach().cpu().numpy().astype(storage_type) + + if bin_name in global_bin_weights: + param.tofile(saved_dir / f"{bin_name}.bin") + elif bin_name.split('.')[-2] == 'query_key_value': + local_dim = None + layer_name_qkv = name.replace(".weight", "") + # Baichuan models use W_pack to transform qkv + # So we can simply use param as qkv weight here + qkv = (0, saved_dir, infer_tp, bin_name, param, + act_range.get(layer_name_qkv), { + "int8_outputs": int8_outputs, + "multi_query_mode": False, + "local_dim": local_dim, + }) + starmap_args.append(qkv) + elif bin_name.split('.')[-2] == 'kv': + continue + else: + starmap_args.append((0, saved_dir, infer_tp, bin_name, param, + act_range.get(name.replace(".weight", "")), { + "int8_outputs": int8_outputs, + "multi_query_mode": False, + "local_dim": None, + })) + + starmap_args = tqdm(starmap_args, desc="saving weights") + if args.processes > 1: + with multiprocessing.Pool(args.processes) as pool: + pool.starmap(split_and_save_weight, starmap_args) + else: + # simpler for debug situations + for starmap_arg in starmap_args: + split_and_save_weight(*starmap_arg) + + +if __name__ == "__main__": + torch.multiprocessing.set_start_method("spawn") + + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--out-dir', + '-o', + type=str, + help='file name of output directory', + required=True) + parser.add_argument('--in-file', + '-i', + type=str, + help='file name of input checkpoint file', + required=True) + parser.add_argument('--tensor-parallelism', + '-tp', + type=int, + help='Requested tensor parallelism for inference', + default=1) + parser.add_argument( + "--processes", + "-p", + type=int, + help="How many processes to spawn for conversion (default: 4)", + default=4) + parser.add_argument( + "--calibrate-kv-cache", + "-kv", + action="store_true", + help= + "Generate scaling factors for KV cache. Used for storing KV cache in int8." + ) + parser.add_argument( + "--smoothquant", + "-sq", + type=float, + default=None, + help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)" + " to Smoothquant the model, and output int8 weights." + " A good first try is 0.5. Must be in [0, 1]") + parser.add_argument("--storage-type", + "-t", + type=str, + default="fp32", + choices=["fp32", "fp16"]) + + args = parser.parse_args() + print("\n=============== Argument ===============") + for key in vars(args): + print("{}: {}".format(key, vars(args)[key])) + print("========================================") + + assert (args.calibrate_kv_cache or args.smoothquant), \ + ("Either INT8 kv cache or SmoothQuant must be enabled for this script. " + "Otherwise you can directly build engines from HuggingFace checkpoints," + " no need to do this bin format conversion. 
") + hf_baichuan_converter(args) diff --git a/examples/baichuan/run.py b/examples/baichuan/run.py index 18d2a5bbf7..6c05bd11ca 100644 --- a/examples/baichuan/run.py +++ b/examples/baichuan/run.py @@ -23,6 +23,7 @@ import torch from transformers import AutoTokenizer import tensorrt_llm +from tensorrt_llm.quantization import QuantMode from tensorrt_llm.runtime import ModelConfig, SamplingConfig from build import get_engine_name # isort:skip @@ -31,6 +32,75 @@ EOS_TOKEN = 2 PAD_TOKEN = 0 +def read_config(config_path: Path): + with open(config_path, 'r') as f: + config = json.load(f) + use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] + remove_input_padding = config['plugin_config']['remove_input_padding'] + dtype = config['builder_config']['precision'] + world_size = config['builder_config']['tensor_parallel'] + assert world_size == tensorrt_llm.mpi_world_size(), \ + f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + num_heads = config['builder_config']['num_heads'] // world_size + hidden_size = config['builder_config']['hidden_size'] // world_size + vocab_size = config['builder_config']['vocab_size'] + num_layers = config['builder_config']['num_layers'] + paged_kv_cache = config['plugin_config']['paged_kv_cache'] + tokens_per_block = config['plugin_config']['tokens_per_block'] + quant_mode = QuantMode(config['builder_config']['quant_mode']) + + model_config = ModelConfig(num_heads=num_heads, + num_kv_heads=num_heads, + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + remove_input_padding=remove_input_padding, + dtype=dtype, + quant_mode=quant_mode) + + return model_config, world_size, dtype + + +def parse_input(input_text: str, input_file: str, tokenizer, end_id: int, + remove_input_padding: bool): + input_tokens = [] + if input_file is None: + input_tokens.append( + tokenizer.encode(input_text, add_special_tokens=False)) + else: + if input_file.endswith('.csv'): + with open(input_file, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for line in csv_reader: + input_tokens.append(np.array(line, dtype='int32')) + elif input_file.endswith('.npy'): + inputs = np.load(input_file) + for row in inputs: + row = row[row != end_id] + input_tokens.append(row) + else: + print('Input file format not supported.') + raise SystemExit + + input_ids = None + input_lengths = torch.tensor([len(x) for x in input_tokens], + dtype=torch.int32, + device='cuda') + if remove_input_padding: + input_ids = np.concatenate(input_tokens) + input_ids = torch.tensor(input_ids, dtype=torch.int32, + device='cuda').unsqueeze(0) + else: + input_ids = torch.nested.to_padded_tensor( + torch.nested.nested_tensor(input_tokens, dtype=torch.int32), + end_id).cuda() + + return input_ids, input_lengths + + def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--max_output_len', type=int, required=True) @@ -67,6 +137,38 @@ def parse_arguments(): return parser.parse_args() +def print_output(output_ids, input_lengths, max_output_len, tokenizer, + output_csv, output_npy): + num_beams = output_ids.size(1) + if output_csv is None and output_npy is None: + for b in range(input_lengths.size(0)): + inputs = output_ids[b][0][:input_lengths[b]].tolist() + input_text = tokenizer.decode(inputs) + print(f'Input: \"{input_text}\"') + for beam in range(num_beams): + output_begin = 
input_lengths[b] + output_end = input_lengths[b] + max_output_len + outputs = output_ids[b][beam][output_begin:output_end].tolist() + output_text = tokenizer.decode(outputs) + print(f'Output: \"{output_text}\"') + + output_ids = output_ids.reshape((-1, output_ids.size(2))) + + if output_csv is not None: + output_file = Path(output_csv) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = output_ids.tolist() + with open(output_file, 'w') as csv_file: + writer = csv.writer(csv_file, delimiter=',') + writer.writerows(outputs) + + if output_npy is not None: + output_file = Path(output_npy) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') + np.save(output_file, outputs) + + def generate( max_output_len: int, log_level: str = 'error', @@ -81,21 +183,9 @@ def generate( ): tensorrt_llm.logger.set_level(log_level) - config_path = os.path.join(engine_dir, 'config.json') - with open(config_path, 'r') as f: - config = json.load(f) - use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] - remove_input_padding = config['plugin_config']['remove_input_padding'] - paged_kv_cache = config['plugin_config']['paged_kv_cache'] - tokens_per_block = config['plugin_config']['tokens_per_block'] - dtype = config['builder_config']['precision'] - world_size = config['builder_config']['tensor_parallel'] - assert world_size == tensorrt_llm.mpi_world_size(), \ - f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - num_heads = config['builder_config']['num_heads'] // world_size - hidden_size = config['builder_config']['hidden_size'] // world_size - vocab_size = config['builder_config']['vocab_size'] - num_layers = config['builder_config']['num_layers'] + engine_dir = Path(engine_dir) + config_path = engine_dir / 'config.json' + model_config, world_size, dtype = read_config(config_path) runtime_rank = tensorrt_llm.mpi_rank() runtime_mapping = tensorrt_llm.Mapping(world_size, @@ -107,17 +197,6 @@ def generate( use_fast=False, trust_remote_code=True) - model_config = ModelConfig(num_heads=num_heads, - num_kv_heads=num_heads, - hidden_size=hidden_size, - vocab_size=vocab_size, - num_layers=num_layers, - gpt_attention_plugin=use_gpt_attention_plugin, - paged_kv_cache=paged_kv_cache, - tokens_per_block=tokens_per_block, - remove_input_padding=remove_input_padding, - dtype=dtype) - repetition_penalty = 1.1 temperature = 0.3 top_k = 5 @@ -144,45 +223,9 @@ def generate( engine_buffer, runtime_mapping) - input_tokens = [] - if input_file is None: - input_tokens.append( - tokenizer.encode(input_text, add_special_tokens=False)) - else: - if input_file.endswith('.csv'): - with open(input_file, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',') - for line in csv_reader: - input_tokens.append(np.array(line, dtype='int32')) - elif input_file.endswith('.npy'): - inputs = np.load(input_file) - for row in inputs: - row = row[row != EOS_TOKEN] - input_tokens.append(row) - else: - print('Input file format not supported.') - raise SystemExit - - input_ids = None - input_lengths = None - if input_file is None: - input_ids = torch.tensor(input_tokens, dtype=torch.int32, device='cuda') - input_lengths = torch.tensor([input_ids.size(1)], - dtype=torch.int32, - device='cuda') - else: - input_lengths = torch.tensor([len(x) for x in input_tokens], - dtype=torch.int32, - device='cuda') - if remove_input_padding: - input_ids = np.concatenate(input_tokens) - input_ids = 
torch.tensor(input_ids, - dtype=torch.int32, - device='cuda').unsqueeze(0) - else: - input_ids = torch.nested.to_padded_tensor( - torch.nested.nested_tensor(input_tokens, dtype=torch.int32), - EOS_TOKEN).cuda() + input_ids, input_lengths = parse_input(input_text, input_file, tokenizer, + EOS_TOKEN, + model_config.remove_input_padding) max_input_length = torch.max(input_lengths).item() decoder.setup(input_lengths.size(0), @@ -194,41 +237,8 @@ def generate( torch.cuda.synchronize() if runtime_rank == 0: - if output_csv is None and output_npy is None: - for b in range(input_lengths.size(0)): - inputs = input_tokens[b] - input_text = tokenizer.decode(inputs) - print(f'Input: \"{input_text}\"') - if num_beams <= 1: - output_begin = max_input_length - outputs = output_ids[b][0][output_begin:].tolist() - output_text = tokenizer.decode(outputs) - print(f'Output: \"{output_text}\"') - else: - for beam in range(num_beams): - output_begin = input_lengths[b] - output_end = input_lengths[b] + max_output_len - outputs = output_ids[b][beam][ - output_begin:output_end].tolist() - output_text = tokenizer.decode(outputs) - print(f'Output: \"{output_text}\"') - - output_ids = output_ids.reshape((-1, output_ids.size(2))) - - if output_csv is not None: - output_file = Path(output_csv) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = output_ids.tolist() - with open(output_file, 'w') as csv_file: - writer = csv.writer(csv_file, delimiter=',') - writer.writerows(outputs) - - if output_npy is not None: - output_file = Path(output_npy) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') - np.save(output_file, outputs) - return + print_output(output_ids, input_lengths, max_output_len, tokenizer, + output_csv, output_npy) if __name__ == '__main__': diff --git a/examples/chatglm2-6b/smoothquant.py b/examples/baichuan/smoothquant.py similarity index 69% rename from examples/chatglm2-6b/smoothquant.py rename to examples/baichuan/smoothquant.py index 0c8dcaa5d4..4e4145cb4e 100644 --- a/examples/chatglm2-6b/smoothquant.py +++ b/examples/baichuan/smoothquant.py @@ -16,6 +16,7 @@ Utilities for SmoothQuant models ''' +import copy import functools from collections import defaultdict @@ -78,6 +79,45 @@ def smooth_gemm(gemm_weights, return scales +@torch.no_grad() +def smooth_gemm_fc1_gate(fc1_weights, + gate_weights, + act_scales, + layernorm_weights=None, + layernorm_bias=None, + alpha=0.5, + weight_scales=None): + gemm_weights = [] + if not isinstance(fc1_weights, list): + fc1_weights = [fc1_weights] + if not isinstance(gate_weights, list): + gate_weights = [gate_weights] + + for i in range(len(fc1_weights)): + gemm_weight = torch.cat([fc1_weights[i], gate_weights[i]], dim=0) + gemm_weights.append(gemm_weight) + + orig_dtype = gemm_weights[0].dtype + + for gemm in gemm_weights: + # gemm_weights are expected to be transposed + assert gemm.shape[1] == act_scales.numel() + + if weight_scales is None: + weight_scales = torch.cat( + [gemm.abs().max(dim=0, keepdim=True)[0] for gemm in gemm_weights], + dim=0) + weight_scales = weight_scales.max(dim=0)[0] + weight_scales.to(float).clamp(min=1e-5) + scales = (act_scales.to(gemm_weights[0].device).to(float).pow(alpha) / + weight_scales.pow(1 - alpha)).clamp(min=1e-5) + + apply_smoothing(scales, fc1_weights + gate_weights, layernorm_weights, + layernorm_bias, orig_dtype) + + return scales + + @torch.no_grad() def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5): if not isinstance(fcs, list): @@ 
-107,9 +147,12 @@ def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5): @torch.no_grad() def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): model.eval() - device = next(model.parameters()).device + next(model.parameters()).device act_scales = defaultdict(lambda: {"x": None, "y": None, "w": None}) + test_token_num = 923 + tokenizer.pad_token = tokenizer.eos_token + def stat_tensor(name, tensor, act_scales, key): hidden_dim = tensor.shape[-1] tensor = tensor.view(-1, hidden_dim).abs().detach() @@ -129,7 +172,7 @@ def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): if act_scales[name]["w"] is None: act_scales[name]["w"] = m.weight.abs().clip(1e-8, - None).max(dim=0)[0] + None).max(dim=1)[0] hooks = [] for name, m in model.named_modules(): @@ -139,14 +182,21 @@ def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): functools.partial(stat_input_hook, name=name))) from datasets import load_dataset - dataset = load_dataset("lambada", split="validation") + dataset_cnn = load_dataset("ccdv/cnn_dailymail", '3.0.0') for i in tqdm(range(num_samples), desc="calibrating model"): - input_ids = tokenizer(dataset[i]["text"], - return_tensors="pt", - max_length=seq_len, - truncation=True).input_ids.to(device) - model(input_ids) + datapoint = dataset_cnn['train'][i:i + 1] + line = copy.copy(datapoint['article']) + line[0] = line[0] + ' TL;DR: ' + line[0] = line[0].strip() + line[0] = line[0].replace(" n't", "n't") + line_encoded = tokenizer(line, + return_tensors="pt", + padding=True, + truncation=True)["input_ids"].type(torch.int64) + line_encoded = line_encoded[:, -test_token_num:] + line_encoded = line_encoded.cuda() + model(line_encoded) for h in hooks: h.remove() diff --git a/examples/baichuan/summarize.py b/examples/baichuan/summarize.py index 27201ac92b..b825123ef3 100644 --- a/examples/baichuan/summarize.py +++ b/examples/baichuan/summarize.py @@ -25,6 +25,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer import tensorrt_llm import tensorrt_llm.profiler as profiler from tensorrt_llm.logger import logger +from tensorrt_llm.quantization import QuantMode from build import get_engine_name # isort:skip @@ -35,7 +36,6 @@ def TRTBaichuan(args, config): assert world_size == tensorrt_llm.mpi_world_size(), \ f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - world_size = config['builder_config']['tensor_parallel'] num_heads = config['builder_config']['num_heads'] // world_size hidden_size = config['builder_config']['hidden_size'] // world_size vocab_size = config['builder_config']['vocab_size'] @@ -45,6 +45,7 @@ def TRTBaichuan(args, config): remove_input_padding = config['plugin_config']['remove_input_padding'] paged_kv_cache = config['plugin_config']['paged_kv_cache'] tokens_per_block = config['plugin_config']['tokens_per_block'] + quant_mode = QuantMode(config['builder_config']['quant_mode']) model_config = tensorrt_llm.runtime.ModelConfig( vocab_size=vocab_size, @@ -56,7 +57,8 @@ def TRTBaichuan(args, config): tokens_per_block=tokens_per_block, remove_input_padding=remove_input_padding, paged_kv_cache=paged_kv_cache, - dtype=dtype) + dtype=dtype, + quant_mode=quant_mode) runtime_rank = tensorrt_llm.mpi_rank() runtime_mapping = tensorrt_llm.Mapping(world_size, diff --git a/examples/baichuan/weight.py b/examples/baichuan/weight.py index f777158ff1..7c3bc687c5 100644 --- a/examples/baichuan/weight.py +++ b/examples/baichuan/weight.py @@ -12,13 +12,17 @@ # WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import configparser import time +from pathlib import Path import numpy as np import torch import tensorrt_llm from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import BaichuanForCausalLM from tensorrt_llm.quantization import QuantMode @@ -81,7 +85,7 @@ def load_from_hf_baichuan(tensorrt_llm_baichuan, if layer_idx is None: continue idx = int(layer_idx) - if idx >= tensorrt_llm_baichuan._num_layers: + if idx >= tensorrt_llm_baichuan.num_layers: continue if 'input_layernorm.weight' in k: tensorrt_llm_baichuan.layers[ @@ -163,3 +167,332 @@ def load_from_hf_baichuan(tensorrt_llm_baichuan, tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) tensorrt_llm.logger.info(f'Weights loaded. Total time: {t}') + + +def parse_bin_config(ini_file): + baichuan_config = configparser.ConfigParser() + baichuan_config.read(ini_file) + + n_embd = baichuan_config.getint('baichuan', 'hidden_size') + n_head = baichuan_config.getint('baichuan', 'num_attention_heads') + n_kv_head = n_head + n_layer = baichuan_config.getint('baichuan', 'num_hidden_layers') + if baichuan_config.has_option('baichuan', 'max_position_embeddings'): + n_positions = baichuan_config.getint('baichuan', + 'max_position_embeddings') + else: + n_positions = baichuan_config.getint('baichuan', 'model_max_length') + vocab_size = baichuan_config.getint('baichuan', 'vocab_size') + hidden_act = baichuan_config.get('baichuan', 'hidden_act') + inter_size = baichuan_config.getint('baichuan', + 'intermediate_size', + fallback=None) + + if inter_size is None: + inter_size = 4 * n_embd + + return n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head + + +def gen_suffix(rank, use_smooth_quant, quant_per_channel): + suffix = f"{rank}.bin" + if use_smooth_quant: + sq_prefix = "int8." + if quant_per_channel: + sq_prefix += "col." + suffix = sq_prefix + suffix + return suffix + + +def load_from_binary(tensorrt_llm_baichuan: BaichuanForCausalLM, + dir_path, + mapping=Mapping(), + fp16=False, + multi_query_mode=False): + tensorrt_llm.logger.info('Loading weights from binary...') + tik = time.time() + + quant_mode = getattr(tensorrt_llm_baichuan, 'quant_mode', QuantMode(0)) + + n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head = parse_bin_config( + Path(dir_path) / 'config.ini') + np_dtype = np.float16 if fp16 else np.float32 + + def fromfile(dir_path, name, shape=None, dtype=None): + dtype = np_dtype if dtype is None else dtype + p = dir_path + '/' + name + if Path(p).exists(): + t = np.fromfile(p, dtype=dtype) + if shape is not None: + t = t.reshape(shape) + return t + return None + + def set_smoothquant_scale_factors(module, + pre_scale_weight, + dir_path, + basename, + shape, + per_tok_dyn, + per_channel, + is_qkv=False, + rank=None): + suffix = "bin" + if per_channel: + if rank is not None: + suffix = f"{rank}." + suffix + suffix = "col." 
+ suffix + + col_shape = shape if (per_channel or is_qkv) else [1, 1] + + if per_tok_dyn: + if pre_scale_weight is not None: + pre_scale_weight.value = np.array([1.0], dtype=np.float32) + if is_qkv and not per_channel: + t = fromfile(dir_path, + f"{basename}scale_w_quant_orig.{rank}.{suffix}", + col_shape, np.float32) + else: + t = fromfile(dir_path, f"{basename}scale_w_quant_orig.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + else: + t = fromfile(dir_path, f"{basename}scale_x_orig_quant.bin", [1], + np.float32) + pre_scale_weight.value = t + if is_qkv: + t = fromfile(dir_path, + f"{basename}scale_y_accum_quant.{rank}.{suffix}", + col_shape, np.float32) + else: + t = fromfile(dir_path, + f"{basename}scale_y_accum_quant.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + t = fromfile(dir_path, f"{basename}scale_y_quant_orig.bin", [1, 1], + np.float32) + module.act_scale.value = t + + def set_smoother(module, dir_path, base_name, shape, rank): + suffix = f"{rank}.bin" + t = fromfile(dir_path, f"{base_name}.smoother.{suffix}", shape, + np.float32) + module.smoother.value = t + + # Determine the quantization mode. + quant_mode = getattr(tensorrt_llm_baichuan, "quant_mode", QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + # Do we use SmoothQuant? + use_smooth_quant = quant_mode.has_act_and_weight_quant() + # Do we use quantization per token? + quant_per_token_dyn = quant_mode.has_per_token_dynamic_scaling() + # Do we use quantization per channel? + quant_per_channel = quant_mode.has_per_channel_scaling() + + # Do we use INT4/INT8 weight-only? + use_weight_only = quant_mode.is_weight_only() + + # Int8 KV cache + use_int8_kv_cache = quant_mode.has_int8_kv_cache() + + # Debug + suffix = gen_suffix(mapping.tp_rank, use_smooth_quant, quant_per_channel) + # The type of weights. + w_type = np_dtype if not use_smooth_quant else np.int8 + + if mapping.is_first_pp_rank(): + tensorrt_llm_baichuan.vocab_embedding.weight.value = (fromfile( + dir_path, 'vocab_embedding.weight.bin', [vocab_size, n_embd])) + + if mapping.is_last_pp_rank(): + tensorrt_llm_baichuan.ln_f.weight.value = (fromfile( + dir_path, 'ln_f.weight.bin')) + # share input embedding + lm_head_weight = fromfile(dir_path, 'lm_head.weight.bin', + [vocab_size, n_embd]) + + if vocab_size % mapping.tp_size != 0: + # padding + vocab_size_padded = tensorrt_llm_baichuan.lm_head.out_features * mapping.tp_size + pad_width = vocab_size_padded - vocab_size + lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), + 'constant', + constant_values=0) + if mapping.is_last_pp_rank(): + tensorrt_llm_baichuan.lm_head.weight.value = np.ascontiguousarray( + split(lm_head_weight, mapping.tp_size, mapping.tp_rank)) + + layers_range = list( + range(mapping.pp_rank * tensorrt_llm_baichuan.num_layers, + (mapping.pp_rank + 1) * tensorrt_llm_baichuan.num_layers, 1)) + + for i in layers_range: + n_groups = n_head // n_kv_head + c_attn_out_dim = ( + 3 * n_embd // mapping.tp_size) if not multi_query_mode else ( + n_embd // mapping.tp_size + + (n_embd // n_head * n_groups) // mapping.tp_size * 2) + idx = i - mapping.pp_rank * tensorrt_llm_baichuan.num_layers + tensorrt_llm_baichuan.layers[idx].input_layernorm.weight.value = ( + fromfile(dir_path, + 'model.layers.' + str(i) + '.input_layernorm.weight.bin')) + t = fromfile( + dir_path, 'model.layers.' 
+ str(i) + + '.attention.query_key_value.weight.' + suffix, + [n_embd, c_attn_out_dim], w_type) + if t is not None: + dst = tensorrt_llm_baichuan.layers[idx].attention.qkv.weight + if use_smooth_quant: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + set_smoothquant_scale_factors( + tensorrt_llm_baichuan.layers[idx].attention.qkv, + tensorrt_llm_baichuan.layers[idx].input_layernorm. + scale_to_int, + dir_path, + 'model.layers.' + str(i) + '.attention.query_key_value.', + [1, c_attn_out_dim], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank, + is_qkv=True) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_baichuan.layers[ + i].attention.qkv.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + dst = tensorrt_llm_baichuan.layers[idx].attention.dense.weight + t = fromfile( + dir_path, + 'model.layers.' + str(i) + '.attention.dense.weight.' + suffix, + [n_embd // mapping.tp_size, n_embd], w_type) + if use_smooth_quant: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + dense_scale = getattr(tensorrt_llm_baichuan.layers[idx].attention, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_baichuan.layers[idx].attention.dense, dense_scale, + dir_path, 'model.layers.' + str(i) + '.attention.dense.', + [1, n_embd], quant_per_token_dyn, quant_per_channel) + set_smoother(tensorrt_llm_baichuan.layers[idx].attention.dense, + dir_path, + 'model.layers.' + str(i) + '.attention.dense', + [1, n_embd // mapping.tp_size], mapping.tp_rank) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_baichuan.layers[ + i].attention.dense.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + dst = tensorrt_llm_baichuan.layers[idx].post_layernorm.weight + dst.value = fromfile( + dir_path, 'model.layers.' + str(i) + '.post_layernorm.weight.bin') + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.fc.weight.' + suffix, + [n_embd, inter_size // mapping.tp_size], w_type) + + if use_smooth_quant: + tensorrt_llm_baichuan.layers[ + idx].mlp.fc.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + set_smoothquant_scale_factors( + tensorrt_llm_baichuan.layers[idx].mlp.fc, + tensorrt_llm_baichuan.layers[idx].post_layernorm.scale_to_int, + dir_path, + 'model.layers.' 
+ str(i) + '.mlp.fc.', + [1, inter_size // mapping.tp_size], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_baichuan.layers[i].mlp.fc.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_baichuan.layers[i].mlp.fc.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_baichuan.layers[ + idx].mlp.fc.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.gate.weight.' + suffix, + [n_embd, inter_size // mapping.tp_size], w_type) + if use_smooth_quant: + tensorrt_llm_baichuan.layers[ + idx].mlp.gate.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + set_smoothquant_scale_factors( + tensorrt_llm_baichuan.layers[idx].mlp.gate, + tensorrt_llm_baichuan.layers[idx].post_layernorm.scale_to_int, + dir_path, + 'model.layers.' + str(i) + '.mlp.gate.', + [1, inter_size // mapping.tp_size], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_baichuan.layers[i].mlp.gate.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_baichuan.layers[i].mlp.gate.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_baichuan.layers[ + idx].mlp.gate.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.proj.weight.' + suffix, + [inter_size // mapping.tp_size, n_embd], w_type) + if use_smooth_quant: + tensorrt_llm_baichuan.layers[ + idx].mlp.proj.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + proj_scale = getattr(tensorrt_llm_baichuan.layers[idx].mlp, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_baichuan.layers[idx].mlp.proj, proj_scale, + dir_path, 'model.layers.' + str(i) + '.mlp.proj.', [1, n_embd], + quant_per_token_dyn, quant_per_channel) + set_smoother(tensorrt_llm_baichuan.layers[idx].mlp.proj, dir_path, + 'model.layers.' + str(i) + '.mlp.proj', + [1, inter_size // mapping.tp_size], mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_baichuan.layers[i].mlp.proj.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_baichuan.layers[i].mlp.proj.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_baichuan.layers[idx].mlp.proj.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + + if use_int8_kv_cache: + t = fromfile( + dir_path, 'model.layers.' + str(i) + + '.attention.query_key_value.scale_y_quant_orig.bin', [1], + np.float32) + tensorrt_llm_baichuan.layers[ + idx].attention.kv_orig_quant_scale.value = 1.0 / t + tensorrt_llm_baichuan.layers[ + idx].attention.kv_quant_orig_scale.value = t + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. 
Total time: {t}') diff --git a/examples/bloom/build.py b/examples/bloom/build.py index 54362d1449..ab4cffd806 100644 --- a/examples/bloom/build.py +++ b/examples/bloom/build.py @@ -27,7 +27,7 @@ from tensorrt_llm._utils import str_dtype_to_trt from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import smooth_quantize, weight_only_quantize +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -345,12 +345,8 @@ def build_rank_engine(builder: Builder, embedding_sharding_dim=args.embedding_sharding_dim, share_embedding_table=share_embedding_table, quant_mode=args.quant_mode) - if args.use_smooth_quant: - tensorrt_llm_bloom = smooth_quantize(tensorrt_llm_bloom, - args.quant_mode) - elif args.use_weight_only: - tensorrt_llm_bloom = weight_only_quantize(tensorrt_llm_bloom, - args.quant_mode) + if args.use_weight_only or args.use_smooth_quant: + tensorrt_llm_bloom = quantize_model(tensorrt_llm_bloom, args.quant_mode) if args.model_dir is not None: logger.info(f'Loading HF BLOOM ... from {args.model_dir}') @@ -442,6 +438,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_bloom) + return engine @@ -491,6 +490,7 @@ def build(rank, args): cache = builder_config.trt_builder_config.get_timing_cache() serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine if rank == 0: ok = builder.save_timing_cache( diff --git a/examples/chatglm2-6b/.gitignore b/examples/chatglm/.gitignore similarity index 57% rename from examples/chatglm2-6b/.gitignore rename to examples/chatglm/.gitignore index baa5534912..979e236242 100644 --- a/examples/chatglm2-6b/.gitignore +++ b/examples/chatglm/.gitignore @@ -1,5 +1,6 @@ __pycache__/ -pyTorchModel/ +chatglm*-6b/ +chatglm*-6b-32k/ trtModel/ dataset/ .vscode/ diff --git a/examples/chatglm/README.md b/examples/chatglm/README.md new file mode 100644 index 0000000000..74042ce268 --- /dev/null +++ b/examples/chatglm/README.md @@ -0,0 +1,144 @@ +# ChatGLM + +This document explains how to build the [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b), [ChatGLM2-6B](https://huggingface.co/THUDM/chatglm2-6b) and [ChatGLM3-6B](https://huggingface.co/THUDM/chatglm3-6b), [ChatGLM2-6B-32k](https://huggingface.co/THUDM/chatglm2-6b-32k), [ChatGLM3-6B-32k](https://huggingface.co/THUDM/chatglm3-6b-32k) models using TensorRT-LLM and run on a single GPU, a single node with multiple GPUs or multiple nodes with multiple GPUs. + +## Overview + +The TensorRT-LLM ChatGLM implementation can be found in [`tensorrt_llm/models/chatglm/model.py`](../../tensorrt_llm/models/chatglm/model.py). +The TensorRT-LLM ChatGLM example code is located in [`examples/chatglm`](./). There are 3 main files in that folder: + +* [`build.py`](./build.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the ChatGLM model. +* [`run.py`](./run.py) to run the inference on an input text. +* [`summarize.py`](./summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset using the model. 
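+
+As a quick orientation, the three scripts are typically used in that order. The snippet below is a minimal sketch, assuming a ChatGLM3-6B checkpoint has already been downloaded into `chatglm3-6b/` as described in the Usage section below; see that section for the full set of options:
+
+```bash
+# build the TensorRT engine(s) from the HuggingFace checkpoint
+python3 build.py -m 3
+# run inference on the default input texts with the engine just built
+python3 run.py -m 3
+# summarize cnn_dailymail articles to compare performance and accuracy
+python3 summarize.py -m 3
+```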
+
+## Support Matrix
+
+* FP16
+* Weight Only Quantization (int8 / int4)
+* Paged KV cache
+* Remove Input Padding
+* Tensor Parallel
+* Strongly Typed
+
+## Usage
+
+The next section describes how to build the engine and run the inference demo.
+
+### 1. Download repo and weights from HuggingFace Transformers
+
+```bash
+pip install -r requirements.txt
+apt-get update
+apt-get install git-lfs
+rm -rf chatglm*
+
+# clone one or more models we want to build
+git clone https://huggingface.co/THUDM/chatglm-6b
+git clone https://huggingface.co/THUDM/chatglm2-6b
+git clone https://huggingface.co/THUDM/chatglm3-6b
+git clone https://huggingface.co/THUDM/chatglm2-6b-32k
+git clone https://huggingface.co/THUDM/chatglm3-6b-32k
+```
+
+### 2. Build TensorRT engine(s)
+
+* This ChatGLM example in TensorRT-LLM builds TensorRT engine(s) directly from the HF checkpoint (rather than from FT checkpoints, as the GPT example does).
+* If no checkpoint directory is specified, TensorRT-LLM will build engine(s) using dummy weights.
+* The [`build.py`](./build.py) script requires a single GPU to build the TensorRT engine(s).
+* You can enable parallel builds to accelerate the engine building process if you have more than one GPU (of the same model) in your system.
+* For parallel building, add the `--parallel_build` argument to the build command (this feature cannot take advantage of more than a single node).
+* The number of TensorRT engines depends on the number of GPUs that will be used to run inference.
+* The argument `--model_version` / `-m` is required; it can be one of "1", "2", "3", "2-32k" or "3-32k" for ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-6B-32K or ChatGLM3-6B-32K, respectively.
+
+#### Examples of build invocations
+
+```bash
+# Build a default engine of ChatGLM3-6B on a single GPU with FP16, GPT Attention plugin, GEMM plugin, RMS Normalization plugin
+python3 build.py -m 3
+
+# Build an engine on a single GPU with FMHA kernels (see the FMHA section below), other configurations are the same as the default example
+python3 build.py -m 3 --enable_context_fmha  # or --enable_context_fmha_fp32_acc
+
+# Build an engine on a single GPU with int8/int4 Weight-Only quantization, other configurations are the same as the default example
+python3 build.py -m 3 --use_weight_only  # or --use_weight_only --weight_only_precision int4
+
+# Build an engine on a single GPU with paged KV cache and remove_input_padding, other configurations are the same as the default example
+python3 build.py -m 3 --paged_kv_cache --remove_input_padding
+
+# Build an engine on two GPUs, other configurations are the same as the default example
+python3 build.py -m 3 --world_size 2
+
+# Build an engine of ChatGLM-6B on a single GPU, other configurations are the same as the default example
+python3 build.py -m 1
+
+# Build an engine of ChatGLM2-6B on a single GPU, other configurations are the same as the default example
+python3 build.py -m 2
+
+# Build an engine of ChatGLM2-6B-32k on a single GPU, other configurations are the same as the default example
+python3 build.py -m 2-32k
+
+# Build an engine of ChatGLM3-6B-32k on a single GPU, other configurations are the same as the default example
+python3 build.py -m 3-32k
+```
+
+#### Enabled plugins
+
+* Use `--use_gpt_attention_plugin <dtype>` to configure the GPT Attention plugin (defaults to float16)
+* Use `--use_gemm_plugin <dtype>` to configure the GEMM plugin (defaults to float16)
+* Use `--use_layernorm_plugin <dtype>` (for ChatGLM-6B) to configure the LayerNorm plugin (defaults to float16)
+* Use `--use_rmsnorm_plugin <dtype>` (for ChatGLM2-6B and ChatGLM3-6B) to configure the RMSNorm plugin (defaults to float16)
+
+#### Fused MultiHead Attention (FMHA)
+
+* Use `--enable_context_fmha` or `--enable_context_fmha_fp32_acc` to enable FMHA kernels, which can provide better performance and lower GPU memory occupancy.
+
+* The switch `--use_gpt_attention_plugin float16` must be used when using FMHA.
+
+* `--enable_context_fmha` uses an FP16 accumulator, which might cause low accuracy. In this case, `--enable_context_fmha_fp32_acc` should be used to protect accuracy at the cost of a small performance drop.
+
+#### Weight Only quantization
+
+* Use `--use_weight_only` to enable INT8 Weight-Only quantization, which significantly lowers the latency and memory footprint.
+
+* Furthermore, use `--weight_only_precision int8` or `--weight_only_precision int4` to configure the data type of the weights.
+
+#### In-flight batching and paged KV cache [TODO]
+
+* The engine must be built accordingly if [in-flight batching in C++ runtime](../../docs/in_flight_batching.md) will be used.
+
+* Use `--use_inflight_batching` to enable In-flight Batching.
+
+* The switches `--use_gpt_attention_plugin=float16`, `--paged_kv_cache`, and `--remove_input_padding` will be set when using In-flight Batching.
+
+* It is possible to use `--use_gpt_attention_plugin float32` with In-flight Batching.
+
+* The block size of the paged KV cache can additionally be controlled using `--tokens_per_block=N`.
+
+### 3. Run
+
+#### Single node, single GPU
+
+```bash
+# Run the default engine of ChatGLM3-6B on a single GPU, other model versions are available if built.
+python3 run.py -m 3
+```
+
+#### Single node, multi GPU
+
+```bash
+# Run the Tensor Parallel 2 engine of ChatGLM3-6B on two GPUs, other model versions are available if built.
+mpirun -n 2 python run.py -m 3
+```
+
+* `--allow-run-as-root` might be needed if using `mpirun` as root.
+
+#### Run comparison of performance and accuracy
+
+```bash
+# Run the summarization task with ChatGLM3-6B, other model versions are available if built.
+python3 summarize.py -m 3 +``` + +## Benchmark + +* The TensorRT-LLM ChatGLM benchmark is located in [benchmarks/](../../benchmarks/README.md) diff --git a/examples/chatglm2-6b/build.py b/examples/chatglm/build.py similarity index 83% rename from examples/chatglm2-6b/build.py rename to examples/chatglm/build.py index d2fc9084b0..88a8687cb2 100644 --- a/examples/chatglm2-6b/build.py +++ b/examples/chatglm/build.py @@ -27,14 +27,11 @@ import tensorrt_llm from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (ChatGLM2_6BHeadModel, smooth_quantize, - weight_only_quantize) +from tensorrt_llm.models import ChatGLMHeadModel, quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode -MODEL_NAME = "chatglm2-6b" - def get_engine_name(model, dtype, tp_size, rank): return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) @@ -61,11 +58,20 @@ def serialize_engine(engine, path): def parse_arguments(args): parser = argparse.ArgumentParser() + parser.add_argument( + '--model_version', + '-m', + type=str, + required=True, + choices=["1", "2", "3", "2-32k", "3-32k"], + help= + '1, 2, 3, 2-32k, 3-32k for ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-32k and ChatGLM3-32k respectively' + ) parser.add_argument('--world_size', type=int, default=1, help='world size, only support tensor parallelism now') - parser.add_argument('--model_dir', type=str, default="pyTorchModel") + parser.add_argument('--model_dir', type=str, default=None) parser.add_argument('--dtype', type=str, default='float16', @@ -105,6 +111,16 @@ def parse_arguments(args): help= "Activates GEMM plugin. You can specify the plugin dtype or leave blank to use the model dtype." ) + parser.add_argument( + '--use_layernorm_plugin', + nargs='?', + const='float16', + type=str, + default='float16', + choices=['float32', 'float16', 'bfloat16', False], + help= + "Activates layernorm plugin for ChatGLM-6B. You can specify the plugin dtype or leave blank to use the model dtype." + ) parser.add_argument( '--use_rmsnorm_plugin', nargs='?', @@ -113,7 +129,7 @@ def parse_arguments(args): default='float16', choices=['float32', 'float16', 'bfloat16', False], help= - "Activates rmsnorm plugin. You can specify the plugin dtype or leave blank to use the model dtype.", + "Activates rmsnorm plugin for ChatGLM2-6B / ChatGLM3-6B. You can specify the plugin dtype or leave blank to use the model dtype." 
) parser.add_argument('--gather_all_token_logits', action='store_true', @@ -242,13 +258,10 @@ def parse_arguments(args): args = parser.parse_args(args) logger.set_level(args.log_level) - args.apply_query_key_layer_scaling = False # always False in TRT-LLM - args.hidden_act = 'swiglu' - args.multi_block_mode = False - plugins_args = [ 'use_gpt_attention_plugin', 'use_gemm_plugin', + 'use_layernorm_plugin', 'use_rmsnorm_plugin', ] for plugin_arg in plugins_args: @@ -258,30 +271,51 @@ def parse_arguments(args): ) setattr(args, plugin_arg, args.dtype) - assert args.model_dir is not None + if args.model_version == "1": + args.model_name = "chatglm-6b" + elif args.model_version in ["2", "3"]: + args.model_name = "chatglm%s-6b" % args.model_version + else: + args.model_name = "chatglm%s-6b-32k" % args.model_version.split("-")[0] + if args.model_dir is None: + args.model_dir = args.model_name with open(Path(args.model_dir) / "config.json", "r") as f: js = json.loads(f.read()) - assert js["_name_or_path"] == "THUDM/" + MODEL_NAME - - args.apply_residual_connection_post_layernorm = js[ - "apply_residual_connection_post_layernorm"] + if args.model_version == "1": + assert args.max_input_len < js["max_sequence_length"] + args.apply_query_key_layer_scaling = False # always False in TRT-LLM args.eos_token_id = js["eos_token_id"] - args.ffn_hidden_size = js["ffn_hidden_size"] args.hidden_size = js["hidden_size"] - args.kv_channels = js["kv_channels"] - args.layernorm_epsilon = js["layernorm_epsilon"] - args.linear_bias = js["add_bias_linear"] - args.multi_query_mode = js["multi_query_attention"] - args.max_seq_length = min(args.max_input_len + args.max_output_len, - js["seq_length"]) - args.num_kv_heads = js["multi_query_group_num"] + args.multi_block_mode = False + args.norm_epsilon = js["layernorm_epsilon"] args.num_heads = js["num_attention_heads"] args.num_layers = js["num_layers"] args.pad_token_id = js["pad_token_id"] - args.qkv_bias = js["add_qkv_bias"] - args.rmsnorm = js["rmsnorm"] args.use_cache = js["use_cache"] - args.vocab_size = js["padded_vocab_size"] + if args.model_version == "1": + args.ffn_hidden_size = js["inner_hidden_size"] + args.hidden_act = 'gelu' + args.linear_bias = True # always True in ChatGLM-6B + args.max_seq_length = min(args.max_input_len + args.max_output_len, + js["max_sequence_length"]) + args.multi_query_mode = False # always False in ChatGLM-6B + args.num_kv_heads = js["num_attention_heads"] + args.qkv_bias = True # always True in ChatGLM-6B + args.vocab_size = js["vocab_size"] + else: + #args.kv_channels = js["kv_channels"] # useless + args.apply_residual_connection_post_layernorm = js[ + "apply_residual_connection_post_layernorm"] + args.ffn_hidden_size = js["ffn_hidden_size"] + args.hidden_act = 'swiglu' + args.linear_bias = js["add_bias_linear"] + args.max_seq_length = min(args.max_input_len + args.max_output_len, + js["seq_length"]) + args.multi_query_mode = js["multi_query_attention"] + args.num_kv_heads = js["multi_query_group_num"] + args.qkv_bias = js["add_qkv_bias"] + args.rmsnorm = js["rmsnorm"] + args.vocab_size = js["padded_vocab_size"] if args.use_inflight_batching: if not args.use_gpt_attention_plugin: @@ -344,13 +378,10 @@ def build_rank_engine(builder: Builder, rank=rank, tp_size=args.world_size, ) - trtllm_model = ChatGLM2_6BHeadModel(args=args) - - if args.use_smooth_quant: - trtllm_model = smooth_quantize(trtllm_model, args.quant_mode) - elif args.use_weight_only: - trtllm_model = weight_only_quantize(trtllm_model, args.quant_mode) + trtllm_model 
= ChatGLMHeadModel(args=args) + if args.use_smooth_quant or args.use_weight_only: + trtllm_model = quantize_model(trtllm_model, args.quant_mode) if args.model_dir is not None: hf_model = transformers.AutoModel.from_pretrained( args.model_dir, trust_remote_code=True).cpu() @@ -359,6 +390,7 @@ def build_rank_engine(builder: Builder, hf_model, mapping=args.mapping, dtype=args.dtype, + model_version=args.model_version, ) del hf_model @@ -370,6 +402,9 @@ def build_rank_engine(builder: Builder, dtype=args.use_gpt_attention_plugin) if args.use_gemm_plugin: network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) + if args.use_layernorm_plugin: + network.plugin_config.set_layernorm_plugin( + dtype=args.use_layernorm_plugin) if args.use_rmsnorm_plugin: network.plugin_config.set_rmsnorm_plugin(dtype=args.use_rmsnorm_plugin) assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) @@ -420,8 +455,11 @@ def build_rank_engine(builder: Builder, # Network -> Engine engine = builder.build_engine(network, builder_config) if rank == 0: - config_path = args.output_dir / 'config.json' + config_path = args.output_dir / (args.model_name + '-config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, trtllm_model) + return engine @@ -453,25 +491,26 @@ def build(rank, args): hidden_act=args.hidden_act, hidden_size=args.hidden_size, max_batch_size=args.max_batch_size, + max_beam_width=args.max_beam_width, max_input_len=args.max_input_len, max_num_tokens=args.max_output_len + args.max_input_len, max_output_len=args.max_output_len, max_position_embeddings=args.max_seq_length, multi_query_mode=args.multi_query_mode, - name=MODEL_NAME, + name=args.model_name, num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, num_layers=args.num_layers, pad_token_id=args.pad_token_id, paged_kv_cache=args.paged_kv_cache, parallel_build=args.parallel_build, - quant_mode=int(args.quant_mode), + quant_mode=args.quant_mode, remove_input_padding=args.remove_input_padding, vocab_size=args.vocab_size, ) - engine_name = get_engine_name(MODEL_NAME, args.dtype, args.world_size, - cur_rank) + engine_name = get_engine_name(args.model_name, args.dtype, + args.world_size, cur_rank) engine = build_rank_engine(builder, builder_config, engine_name, cur_rank, args) assert engine is not None, f'Failed to build engine for rank {cur_rank}' @@ -483,6 +522,7 @@ def build(rank, args): ) serialize_engine(engine, args.output_dir / engine_name) + del engine if rank == 0: ok = builder.save_timing_cache(builder_config, timing_cache_file) diff --git a/examples/chatglm2-6b/requirements.txt b/examples/chatglm/requirements.txt similarity index 100% rename from examples/chatglm2-6b/requirements.txt rename to examples/chatglm/requirements.txt diff --git a/examples/chatglm6b/run.py b/examples/chatglm/run.py similarity index 53% rename from examples/chatglm6b/run.py rename to examples/chatglm/run.py index ba80c679cc..6eaf71b4b4 100644 --- a/examples/chatglm6b/run.py +++ b/examples/chatglm/run.py @@ -23,16 +23,23 @@ import transformers import tensorrt_llm from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import (ChatGLM6BHeadModelGenerationSession, +from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession, ModelConfig, SamplingConfig) from build import find_engines # isort:skip -MODEL_NAME = "chatglm-6b" - -def parse_arguments(): +def parse_arguments(args=None): parser = argparse.ArgumentParser() + parser.add_argument( + '--model_version', + '-m', + 
type=str, + default="3", + choices=["1", "2", "3", "2-32k", "3-32k"], + help= + '1, 2, 3, 2-32k, 3-32k for ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-32k and ChatGLM3-32k respectively' + ) parser.add_argument('--max_output_len', type=int, default=1024) parser.add_argument('--log_level', type=str, default='error') parser.add_argument('--engine_dir', type=str, default='trtModel') @@ -41,7 +48,10 @@ def parse_arguments(): '--input_text', type=str, nargs='*', - default=["Hello", "Could you introduce NVIDIA Corporation for me?"], + default=[ + "What's new between ChatGLM3-6B and ChatGLM2-6B?", + "Could you introduce NVIDIA Corporation for me?", + ], ) parser.add_argument( '--input_tokens', @@ -53,14 +63,14 @@ def parse_arguments(): parser.add_argument( '--tokenizer_dir', type=str, - default='pyTorchModel', + default=None, help='Directory containing the tokenizer model.', ) parser.add_argument('--temperature', type=float, default=1.0) parser.add_argument('--top_k', type=int, default=1) parser.add_argument('--top_p', type=float, default=0.0) parser.add_argument('--random_seed', type=int, default=1) - return parser.parse_args() + return parser.parse_args(args) def process_response(responseList): @@ -87,42 +97,81 @@ if __name__ == '__main__': args = parse_arguments() tensorrt_llm.logger.set_level(args.log_level) - config_path = os.path.join(args.engine_dir, 'config.json') + if args.model_version == "1": + model_name = "chatglm-6b" + elif args.model_version in ["2", "3"]: + model_name = "chatglm%s-6b" % args.model_version + else: + model_name = "chatglm%s-6b-32k" % args.model_version.split("-")[0] + + config_path = os.path.join(args.engine_dir, model_name + '-config.json') with open(config_path, 'r') as f: config = json.load(f) - assert (config['builder_config']['name'] == MODEL_NAME) + dtype = config['builder_config']['precision'] end_id = config['builder_config']['eos_token_id'] pad_id = config['builder_config']['pad_token_id'] max_batch_size = config['builder_config']['max_batch_size'] + max_input_len = config['builder_config']['max_input_len'] + max_output_len = config['builder_config']['max_output_len'] + max_beam_width = config['builder_config']['max_beam_width'] + remove_input_padding = config['builder_config']['remove_input_padding'] use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] world_size = config['builder_config']['tensor_parallel'] assert world_size == tensorrt_llm.mpi_world_size( ), f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + if args.max_output_len > max_output_len: + print("Truncate max_output_len as %d" % max_output_len) + max_output_len = min(max_output_len, args.max_output_len) + if args.beam_width > max_beam_width: + print("Truncate beam_width as %d" % max_beam_width) + beam_width = min(max_beam_width, args.beam_width) + runtime_rank = tensorrt_llm.mpi_rank() - runtime_mapping = tensorrt_llm.Mapping(world_size, - runtime_rank, - tp_size=world_size) + runtime_mapping = tensorrt_llm.Mapping( + world_size, + runtime_rank, + tp_size=world_size, + ) torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - serialize_path = find_engines(Path(args.engine_dir), - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] + serialize_path = find_engines( + Path(args.engine_dir), + model_name=model_name, + dtype=dtype, + tp_size=world_size, + rank=runtime_rank, + )[0] + if args.tokenizer_dir is None: + args.tokenizer_dir = model_name tokenizer = transformers.AutoTokenizer.from_pretrained( 
args.tokenizer_dir, trust_remote_code=True) input_ids = None input_text = None if args.input_tokens is None: - input_text = args.input_text[:max_batch_size] + input_text = args.input_text + batch_size = len(input_text) + if batch_size > max_batch_size: + print("Truncate batch_size as %d" % max_batch_size) + batch_size = max_batch_size + input_text = input_text[:max_batch_size] tokenized = tokenizer(input_text, return_tensors="pt", padding=True, return_length=True) - input_ids = tokenized['input_ids'].int().contiguous().cuda() - input_lengths = tokenized['length'].int().contiguous().cuda() + input_ids = tokenized['input_ids'].int() + input_lengths = tokenized['length'].int() + max_input_len_real = torch.max(input_lengths) + if max_input_len_real > max_input_len: + print("Truncate input_length as %d" % max_input_len) + input_ids = input_ids[:, :max_input_len] + input_lengths = torch.where(input_lengths > max_input_len, + max_input_len, input_lengths) + else: + max_input_len = max_input_len_real + else: input_ids = [] with open(args.input_tokens) as f_in: @@ -133,7 +182,24 @@ if __name__ == '__main__': input_ids = torch.tensor(input_ids, dtype=torch.int32).cuda().unsqueeze(0) - if use_gpt_attention_plugin: + input_ids_padding = input_ids.clone() + if remove_input_padding: + input_ids_no_padding = torch.zeros(1, + torch.sum(input_lengths), + dtype=torch.int32) + lengths_acc = torch.cumsum( + torch.cat([torch.IntTensor([0]), input_lengths]), + dim=0, + ) + for i in range(len(input_ids)): + input_ids_no_padding[ + 0, lengths_acc[i]:lengths_acc[i + 1]] = torch.IntTensor( + input_ids[i, + max_input_len - input_lengths[i]:max_input_len]) + + input_ids = input_ids_no_padding + + elif use_gpt_attention_plugin: # when using gpt attention plugin, inputs needs to align at the head input_ids_padding_right = torch.zeros_like(input_ids) + end_id for i, sample in enumerate(input_ids): @@ -155,7 +221,7 @@ if __name__ == '__main__': hidden_size=config['builder_config']['hidden_size'] // world_size, gpt_attention_plugin=use_gpt_attention_plugin, remove_input_padding=config['builder_config']['remove_input_padding'], - model_name=MODEL_NAME, + model_name=model_name, paged_kv_cache=config['builder_config']['paged_kv_cache'], quant_mode=QuantMode(config['builder_config']['quant_mode']), dtype=dtype, @@ -164,7 +230,7 @@ if __name__ == '__main__': sampling_config = SamplingConfig( end_id=end_id, pad_id=pad_id, - num_beams=args.beam_width, + num_beams=beam_width, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, @@ -173,32 +239,49 @@ if __name__ == '__main__': with open(serialize_path, 'rb') as f: engine_buffer = f.read() - decoder = ChatGLM6BHeadModelGenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) - decoder.setup(input_ids.size(0), input_ids.size(1), args.max_output_len, - args.beam_width) - output_ids = decoder.decode(input_ids, input_lengths, sampling_config) - torch.cuda.synchronize() - for i in range(len(output_ids.tolist())): - output_beams_list = [ - tokenizer.batch_decode(output_ids[batch_idx, :, - input_lengths[batch_idx]:], - skip_special_tokens=True) - for batch_idx in range(input_ids.size(0)) - ] - output_text = process_response(output_beams_list[i]) - end = torch.where(input_ids[i] == end_id)[0] - inputLength = int(end[0]) if len(end) > 0 else input_ids.shape[1] - print("\nInput %2d ---> len=%d\n%s" % (i, inputLength, input_text[i])) - print("\nOutput %2d --->" % i) - for j, simple_output in enumerate(output_text): - end = torch.where(output_ids[i, j, 
input_lengths[i]:] == end_id)[0] - outputLength = int(end[0]) if len(end) > 0 else args.max_output_len - print(" Beam %2d ---> len=%d\n%s" % - (j, outputLength, simple_output)) + if model_name == "chatglm-6b": + decoder = ChatGLMGenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) + else: + decoder = GenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) + decoder.setup( + len(input_text), + max_input_len, + max_output_len, + beam_width, + ) + output = decoder.decode( + input_ids.contiguous().cuda(), + input_lengths.contiguous().cuda(), + sampling_config, + output_sequence_lengths=True, + return_dict=True, + ) + torch.cuda.synchronize() + output_ids = output["output_ids"] + output_lengths = output["sequence_lengths"] + + if runtime_rank == 0: + for i in range(batch_size): + print("\nInput %2d ---> len=%d\n%s" % + (i, input_lengths[i], input_text[i])) + print("\nOutput %2d --->" % i) + output_ids__one_batch = output_ids[i, :, input_lengths[i]:] + output_lengths_one_batch = output_lengths[i] + output_token_list = tokenizer.batch_decode(output_ids__one_batch, + skip_special_tokens=True) + output_token_list = process_response(output_token_list) + for j, (length, simple_output) in enumerate( + zip(output_lengths_one_batch, output_token_list)): + print("\n Beam %2d ---> len=%d\n%s" % + (j, length, simple_output)) print("Finished!") diff --git a/examples/chatglm6b/smoothquant.py b/examples/chatglm/smoothquant.py similarity index 96% rename from examples/chatglm6b/smoothquant.py rename to examples/chatglm/smoothquant.py index 0c8dcaa5d4..163592ff2c 100644 --- a/examples/chatglm6b/smoothquant.py +++ b/examples/chatglm/smoothquant.py @@ -105,7 +105,11 @@ def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5): @torch.no_grad() -def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): +def capture_activation_range(model, + tokenizer, + dataset, + num_samples=512, + seq_len=512): model.eval() device = next(model.parameters()).device act_scales = defaultdict(lambda: {"x": None, "y": None, "w": None}) @@ -138,9 +142,6 @@ def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): m.register_forward_hook( functools.partial(stat_input_hook, name=name))) - from datasets import load_dataset - dataset = load_dataset("lambada", split="validation") - for i in tqdm(range(num_samples), desc="calibrating model"): input_ids = tokenizer(dataset[i]["text"], return_tensors="pt", diff --git a/examples/chatglm6b/summarize.py b/examples/chatglm/summarize.py similarity index 90% rename from examples/chatglm6b/summarize.py rename to examples/chatglm/summarize.py index 2e2a42107a..086bc68c33 100644 --- a/examples/chatglm6b/summarize.py +++ b/examples/chatglm/summarize.py @@ -26,16 +26,17 @@ from transformers import AutoModel, AutoTokenizer import tensorrt_llm import tensorrt_llm.profiler as profiler from tensorrt_llm.logger import logger -from tensorrt_llm.runtime import (ChatGLM6BHeadModelGenerationSession, +from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession, ModelConfig, SamplingConfig) from build import find_engines # isort:skip -MODEL_NAME = "chatglm-6b" +model_name = "" def TRT(args, config): + model_name = config['builder_config']['name'] dtype = config['builder_config']['precision'] world_size = config['builder_config']['tensor_parallel'] assert world_size == tensorrt_llm.mpi_world_size(), \ @@ -45,11 +46,12 @@ def TRT(args, config): remove_input_padding = config['plugin_config']['remove_input_padding'] model_config 
= ModelConfig( - model_name=MODEL_NAME, + model_name=model_name, vocab_size=config['builder_config']['vocab_size'], num_layers=config['builder_config']['num_layers'], num_heads=config['builder_config']['num_heads'] // world_size, - num_kv_heads=config['builder_config']['num_heads'] // world_size, + num_kv_heads=max(config['builder_config']['num_kv_heads'] // world_size, + 1), hidden_size=config['builder_config']['hidden_size'] // world_size, gpt_attention_plugin=bool( config['plugin_config']['gpt_attention_plugin']), @@ -66,20 +68,31 @@ def TRT(args, config): tp_size=world_size) torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - serialize_path = find_engines(args.engine_dir, - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] + serialize_path = find_engines( + args.engine_dir, + model_name=model_name, + dtype=dtype, + tp_size=world_size, + rank=runtime_rank, + )[0] tensorrt_llm.logger.set_level(args.log_level) with open(serialize_path, 'rb') as f: engine_buffer = f.read() - decoder = ChatGLM6BHeadModelGenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) + + if model_name == 'chatglm-6b': + decoder = ChatGLMGenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) + else: + decoder = GenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) return decoder @@ -159,7 +172,10 @@ def main(args): line[i], return_tensors='pt', ).type(torch.int32) - input_id = input_id[:, -test_token_num:] + if model_name == 'chatglm-6b': + input_id = input_id[:, -test_token_num:] + else: + input_id = input_id[:, :test_token_num] line_encoded.append(input_id) input_lengths.append(input_id.shape[-1]) @@ -239,7 +255,10 @@ def main(args): line[i], return_tensors='pt', ).type(torch.int64) - input_id = input_id[:, -test_token_num:] + if model_name == 'chatglm-6b': + input_id = input_id[:, -test_token_num:] + else: + input_id = input_id[:, :test_token_num] line_encoded.append(input_id) input_lengths.append(input_id.shape[-1]) @@ -393,7 +412,16 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--hf_model_location', type=str, default='pyTorchModel') + parser.add_argument( + '--model_version', + '-m', + type=str, + required=True, + choices=["1", "2", "3", "2-32k", "3-32k"], + help= + '1, 2, 3, 2-32k, 3-32k for ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-32k and ChatGLM3-32k respectively' + ) + parser.add_argument('--hf_model_location', type=str, default=None) parser.add_argument( '--tokenizer', default=None, @@ -424,6 +452,15 @@ if __name__ == '__main__': parser.add_argument('--length_penalty', type=float, default=1.0) args = parser.parse_args() + + if args.model_version == "1": + args.model_name = "chatglm-6b" + elif args.model_version in ["2", "3"]: + args.model_name = "chatglm%s-6b" % args.model_version + else: + args.model_name = "chatglm%s-6b-32k" % args.model_version.split("-")[0] + if args.tokenizer == None: - args.tokenizer = args.hf_model_location + args.tokenizer = args.model_name + main(args) diff --git a/examples/chatglm/weight.py b/examples/chatglm/weight.py new file mode 100644 index 0000000000..4961c499ee --- /dev/null +++ b/examples/chatglm/weight.py @@ -0,0 +1,366 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import time + +import torch +import torch.nn.functional as F + +import tensorrt_llm +from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy +from tensorrt_llm.quantization import QuantMode + + +def tile_kv_weight_bias(v, kv_num_head, tp_size): + head_size = v.shape[0] // kv_num_head + reps = tp_size // kv_num_head + if v.ndim == 1: + v = v.reshape(kv_num_head, head_size)[:, None, :] + v = v.expand(kv_num_head, reps, head_size).reshape(-1).clone() + else: + hidden_size = v.shape[1] + v = v.reshape(kv_num_head, head_size, hidden_size)[:, None, :, :] + v = v.expand(kv_num_head, reps, head_size, + hidden_size).reshape(-1, hidden_size).clone() + return v + + +def split_qkv(v, tp_size, rank, hidden_size, num_heads, num_kv_heads): + head_size = hidden_size // num_heads + if tp_size == 1: + return v + + assert v.shape[0] == hidden_size + head_size * num_kv_heads * 2 + query = v[:hidden_size] + key = v[hidden_size:hidden_size + head_size * num_kv_heads] + value = v[hidden_size + head_size * num_kv_heads:hidden_size + + head_size * num_kv_heads * 2] + + if num_kv_heads < tp_size: + key = tile_kv_weight_bias(key, num_kv_heads, tp_size) + value = tile_kv_weight_bias(value, num_kv_heads, tp_size) + assert (key.shape[0] % (tp_size * head_size)) == 0 + assert (value.shape[0] % (tp_size * head_size)) == 0 + + q_tmp = torch.chunk(query, tp_size, dim=0)[rank] + k_tmp = torch.chunk(key, tp_size, dim=0)[rank] + v_tmp = torch.chunk(value, tp_size, dim=0)[rank] + return torch.concatenate([q_tmp, k_tmp, v_tmp], dim=0).contiguous() + + +def load_quant_weight(src, value_dst, scale_dst, plugin_weight_only_quant_type): + v = torch.transpose(src, dim0=0, dim1=1).contiguous() + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + v, plugin_weight_only_quant_type) + value_dst.value = torch_to_numpy(processed_torch_weights) + scale_dst.value = torch_to_numpy(torch_weight_scales) + + +def load_from_hf( + trt_model, + hf_model, + mapping=None, + dtype="float32", + model_version="3", + multi_query_mode=False, +): + # [TODO] Merge model_version=="1" and model_version>="2" + tensorrt_llm.logger.info("Loading weights from HF") + tik = time.time() + + torch_type = str_dtype_to_torch(dtype) + quant_mode = getattr(trt_model, 'quant_mode', QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + use_weight_only = quant_mode.is_weight_only() + + hidden_size = hf_model.config.hidden_size + num_heads = hf_model.config.num_attention_heads + + layers_per_pipeline_stage = trt_model.num_layers // mapping.pp_size + layers_range = list( + range(mapping.pp_rank * layers_per_pipeline_stage, + (mapping.pp_rank + 1) * layers_per_pipeline_stage)) + feed_weight_count = 0 + + if model_version == "1": + num_kv_heads = hf_model.config.num_attention_heads + + if mapping.is_first_pp_rank(): + # Embedding + weight = hf_model.transformer.word_embeddings.weight.to( + torch_type).detach().cpu() + trt_model.embedding.weight.value = 
torch_to_numpy(weight) + feed_weight_count += 1 + if mapping.is_last_pp_rank(): + # Final normalization + weight = hf_model.transformer.final_layernorm.weight.to( + torch_type).detach().cpu() + trt_model.final_norm.weight.value = torch_to_numpy(weight) + bias = hf_model.transformer.final_layernorm.bias.to( + torch_type).detach().cpu() + trt_model.final_norm.bias.value = torch_to_numpy(bias) + feed_weight_count += 2 + + # Final LM + weight = hf_model.lm_head.weight.to(torch_type).detach().cpu() + if weight.shape[0] % mapping.tp_size != 0: + pad_width = trt_model.lm_head.out_features * mapping.tp_size - weight.shape[ + 0] + weight = F.pad(weight, (0, 0, 0, pad_width)) + split_weight = torch.chunk(weight, mapping.tp_size, + dim=0)[mapping.rank] + trt_model.lm_head.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + for layer_idx in range(28): + if layer_idx not in layers_range: + continue + i = int(layer_idx) - mapping.pp_rank * layers_per_pipeline_stage + if i >= trt_model.num_layers: + continue + + # Pre normalization + weight = hf_model.transformer.layers[i].input_layernorm.weight.to( + torch_type).detach().cpu() + trt_model.layers[i].pre_norm.weight.value = torch_to_numpy(weight) + bias = hf_model.transformer.layers[i].input_layernorm.bias.to( + torch_type).detach().cpu() + trt_model.layers[i].pre_norm.bias.value = torch_to_numpy(bias) + feed_weight_count += 2 + + # QKV multiplication weight + weight = hf_model.transformer.layers[ + i].attention.query_key_value.weight.to( + torch_type).detach().cpu() + split_weight = split_qkv(weight, mapping.tp_size, mapping.tp_rank, + hidden_size, num_heads, num_kv_heads) + dst = trt_model.layers[i].attention.qkv + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # QKV multiplication bias + bias = hf_model.transformer.layers[ + i].attention.query_key_value.bias.to(torch_type).detach().cpu() + split_bias = split_qkv(bias, mapping.tp_size, mapping.tp_rank, + hidden_size, num_heads, num_kv_heads) + trt_model.layers[i].attention.qkv.bias.value = torch_to_numpy( + split_bias) + feed_weight_count += 1 + + # Dense multiplication weight (no bias) + weight = hf_model.transformer.layers[i].attention.dense.weight.to( + torch_type).detach().cpu() + split_weight = torch.chunk(weight, mapping.tp_size, + dim=1)[mapping.rank] + dst = trt_model.layers[i].attention.dense + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # Post normalization + weight = hf_model.transformer.layers[ + i].post_attention_layernorm.weight.to( + torch_type).detach().cpu() + trt_model.layers[i].post_norm.weight.value = torch_to_numpy(weight) + bias = hf_model.transformer.layers[ + i].post_attention_layernorm.bias.to(torch_type).detach().cpu() + trt_model.layers[i].post_norm.bias.value = torch_to_numpy(bias) + feed_weight_count += 2 + + # Multilayer perceptron h -> 4h (no bias) + weight = hf_model.transformer.layers[i].mlp.dense_h_to_4h.weight.to( + torch_type).detach().cpu() + split_weight = torch.chunk(weight, mapping.tp_size, + dim=0)[mapping.rank] + dst = trt_model.layers[i].mlp.fc + if use_weight_only: + load_quant_weight( + 
src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # Multilayer perceptron 4h -> h (no bias) + weight = hf_model.transformer.layers[i].mlp.dense_4h_to_h.weight.to( + torch_type).detach().cpu() + split_weight = torch.chunk(weight, mapping.tp_size, + dim=1)[mapping.rank] + dst = trt_model.layers[i].mlp.proj + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + assert feed_weight_count == 4 + trt_model.num_layers * 9, "Some weights not loaded from HF" + + else: + num_kv_heads = hf_model.config.multi_query_group_num + + if mapping.is_first_pp_rank(): + # Embedding + weight = hf_model.transformer.embedding.word_embeddings.weight.to( + torch_type).detach().cpu() + trt_model.embedding.weight.value = torch_to_numpy(weight) + feed_weight_count += 1 + if mapping.is_last_pp_rank(): + # Final normalization + weight = hf_model.transformer.encoder.final_layernorm.weight.to( + torch_type).detach().cpu() + trt_model.final_norm.weight.value = torch_to_numpy(weight) + feed_weight_count += 1 + + # Final LM + weight = hf_model.transformer.output_layer.weight.to( + torch_type).detach().cpu() + if weight.shape[0] % mapping.tp_size != 0: + pad_width = trt_model.lm_head.out_features * mapping.tp_size - weight.shape[ + 0] + weight = F.pad(weight, (0, 0, 0, pad_width)) + split_weight = torch.chunk(weight, mapping.tp_size, + dim=0)[mapping.rank] + trt_model.lm_head.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + for layer_idx in range(28): + if layer_idx not in layers_range: + continue + i = int(layer_idx) - mapping.pp_rank * layers_per_pipeline_stage + if i >= trt_model.num_layers: + continue + + # Pre normalization + weight = hf_model.transformer.encoder.layers[ + i].input_layernorm.weight.to(torch_type).detach().cpu() + trt_model.layers[i].pre_norm.weight.value = torch_to_numpy(weight) + feed_weight_count += 1 + + # QKV multiplication weight + weight = hf_model.transformer.encoder.layers[ + i].self_attention.query_key_value.weight.to( + torch_type).detach().cpu() + split_weight = split_qkv(weight, mapping.tp_size, mapping.tp_rank, + hidden_size, num_heads, num_kv_heads) + dst = trt_model.layers[i].attention.qkv + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # QKV multiplication bias + bias = hf_model.transformer.encoder.layers[ + i].self_attention.query_key_value.bias.to( + torch_type).detach().cpu() + split_bias = split_qkv(bias, mapping.tp_size, mapping.tp_rank, + hidden_size, num_heads, num_kv_heads) + trt_model.layers[i].attention.qkv.bias.value = torch_to_numpy( + split_bias) + feed_weight_count += 1 + + # Dense multiplication weight (no bias) + weight = hf_model.transformer.encoder.layers[ + i].self_attention.dense.weight.to(torch_type).detach().cpu() + split_weight = torch.chunk(weight, mapping.tp_size, + dim=1)[mapping.rank] + dst = trt_model.layers[i].attention.dense + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + 
scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # Post normalization + weight = hf_model.transformer.encoder.layers[ + i].post_attention_layernorm.weight.to( + torch_type).detach().cpu() + trt_model.layers[i].post_norm.weight.value = torch_to_numpy(weight) + feed_weight_count += 1 + + # Multilayer perceptron h -> 4h (no bias) + weight = hf_model.transformer.encoder.layers[ + i].mlp.dense_h_to_4h.weight.to(torch_type).detach().cpu() + split_weight = torch.chunk(weight, 2 * mapping.tp_size, dim=0) + # swap first and second half weight in columns to adapt trt_llm Swiglu + split_weight = torch.cat( + [ + split_weight[mapping.rank + mapping.tp_size], + split_weight[mapping.rank], + ], + dim=0, + ) + dst = trt_model.layers[i].mlp.fc + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # Multilayer perceptron 4h -> h (no bias) + weight = hf_model.transformer.encoder.layers[ + i].mlp.dense_4h_to_h.weight.to(torch_type).detach().cpu() + split_weight = torch.chunk(weight, mapping.tp_size, + dim=1)[mapping.rank] + dst = trt_model.layers[i].mlp.proj + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + assert feed_weight_count == 3 + trt_model.num_layers * 7, "Some weights not loaded from HF" + + tok = time.time() + + tensorrt_llm.logger.info("Loading weights finish in %.2fs" % (tok - tik)) + return trt_model diff --git a/examples/chatglm2-6b/README.md b/examples/chatglm2-6b/README.md deleted file mode 100644 index 87eba32d88..0000000000 --- a/examples/chatglm2-6b/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# ChatGLM2-6B - -This document explains how to build the [ChatGLM2-6B](https://huggingface.co/THUDM/chatglm2-6b) model using TensorRT-LLM and run on a single GPU. - -## Overview - -The TensorRT-LLM ChatGLM2-6B implementation can be found in [`tensorrt_llm/models/chatglm2_6b/model.py`](../../tensorrt_llm/models/chatglm6b/model.py). -The TensorRT-LLM ChatGLM2-6B example code is located in [`examples/chatglm2-6b`](./). There are 3 main files in that folder: - -* [`build.py`](./build.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the ChatGLM-6B model. -* [`run.py`](./run.py) to run the inference on an input text. -* [`summarize.py`](./summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset using the model. - -## Usage - -The next section describe how to build the engine and run the inference demo. - -### 1. Prepare environment and download weights from HuggingFace Transformers - -```bash -apt-get update -apt-get install git-lfs -git clone https://huggingface.co/THUDM/chatglm2-6b pyTorchModel -``` - -### 2. Build TensorRT engine(s) - -+ This ChatGLM2-6B example in TensorRT-LLM builds TensorRT engine(s) using HF checkpoint directly (rather than using FT checkpoints such as GPT example). -+ If no checkpoint directory is specified, TensorRT-LLM will build engine(s) using dummy weights. 
-+ The [`build.py`](./build.py) script requires a single GPU to build the TensorRT engine(s).
-+ You can enable parallel builds to accelerate the engine building process if you have more than one GPU (of the same model) in your system.
-+ For parallel building, add the `--parallel_build` argument to the build command (this feature cannot take advantage of more than a single node).
-+ The number of TensorRT engines depends on the number of GPUs that will be used to run inference.
-
-#### Examples of build invocations:
-
-```bash
-# Build a single-GPU float16 engine using HF weights.
-# --use_gpt_attention_plugin must be used to deal with inputs of different lengths in one batch.
-# --use_gemm_plugin, --use_layernorm_plugin, --enable_context_fmha, --enable_context_fmha_fp32_acc are used to improve accuracy or performance.
-python3 build.py --dtype float16 \
-                 --use_gpt_attention_plugin float16 \
-                 --use_gemm_plugin float16
-```
-
-#### INT8 Weight Only
-
-+ Enable INT8 weight-only quantization by adding `--use_weight_only`; this significantly lowers the latency and memory footprint.
-
-#### Fused MultiHead Attention (FMHA)
-
-+ Use `--enable_context_fmha` or `--enable_context_fmha_fp32_acc` to enable FMHA kernels, which can provide better performance and lower GPU memory occupancy.
-
-+ The switch `--use_gpt_attention_plugin float16` must be used when FMHA is enabled.
-
-+ `--enable_context_fmha` uses an FP16 accumulator, which might reduce accuracy. In that case, `--enable_context_fmha_fp32_acc` should be used to protect accuracy at the cost of a small performance drop.
-
-#### In-flight batching and paged KV cache
-
-+ The engine must be built accordingly if [in-flight batching in the C++ runtime](../../docs/in_flight_batching.md) will be used.
-
-+ Use `--use_inflight_batching` to enable In-flight Batching.
-
-+ The switches `--use_gpt_attention_plugin=float16`, `--paged_kv_cache` and `--remove_input_padding` are set automatically when In-flight Batching is enabled.
-
-+ It is possible to use `--use_gpt_attention_plugin float32` with In-flight Batching.
-
-+ The block size of the paged KV cache can additionally be controlled with `--tokens_per_block=N`.
-
-### 3. Run
-
-#### Single node, single GPU
-
-Run the TensorRT-LLM ChatGLM2-6B model on a single GPU:
-
-```bash
-# Run the ChatGLM2-6B model on a single GPU.
-python3 run.py
-```
-
-Run a comparison of performance and accuracy:
-
-```bash
-# Run the summarization task.
-python3 summarize.py
-```
-
-## Benchmark
-
-+ [TODO] The TensorRT-LLM ChatGLM2-6B benchmark is located in [benchmarks/](../../benchmarks/README.md)
diff --git a/examples/chatglm2-6b/run.py b/examples/chatglm2-6b/run.py
deleted file mode 100644
index 319a0b14df..0000000000
--- a/examples/chatglm2-6b/run.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
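The unified ChatGLM weight loader earlier in this patch relies on a `split_qkv` helper that is not included in the diff. Purely as an illustration of the idea, and not the actual TensorRT-LLM helper, a tensor-parallel split of a fused QKV tensor for grouped-query attention could look roughly like the sketch below; the `[Q; K; V]` row layout, the `head_dim` derivation and the divisibility assumptions are assumptions of this sketch.

```python
import torch


def split_qkv_sketch(qkv, tp_size, tp_rank, hidden_size, num_heads, num_kv_heads):
    # Illustrative sketch only. Assumes the fused tensor stacks Q, K and V along
    # the output dimension, that head_dim = hidden_size // num_heads, and that
    # num_heads and num_kv_heads are both divisible by tp_size (KV-head
    # replication for num_kv_heads < tp_size is not handled here).
    head_dim = hidden_size // num_heads
    q, k, v = torch.split(
        qkv,
        [num_heads * head_dim, num_kv_heads * head_dim, num_kv_heads * head_dim],
        dim=0)
    # Keep only this rank's rows of each component, then re-fuse them.
    q = torch.chunk(q, tp_size, dim=0)[tp_rank]
    k = torch.chunk(k, tp_size, dim=0)[tp_rank]
    v = torch.chunk(v, tp_size, dim=0)[tp_rank]
    return torch.cat([q, k, v], dim=0)
```

The same split would apply to the fused QKV bias, since it shares the output-dimension layout.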
-import argparse -import json -import os -import re -from pathlib import Path - -import torch -import transformers - -import tensorrt_llm -from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import GenerationSession, ModelConfig, SamplingConfig - -from build import find_engines # isort:skip - -MODEL_NAME = "chatglm2-6b" - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--max_output_len', type=int, default=1024) - parser.add_argument('--log_level', type=str, default='error') - parser.add_argument('--engine_dir', type=str, default='trtModel') - parser.add_argument('--beam_width', type=int, default=1) - parser.add_argument( - '--input_text', - type=str, - nargs='*', - default=[ - "What's new between ChatGLM2-6B and ChatGLM-6B?", - "Could you introduce NVIDIA Corporation for me?" - ], - ) - parser.add_argument( - '--input_tokens', - type=str, - help= - 'CSV or Numpy file containing tokenized input. Alternative to text input.', - default=None, - ) - parser.add_argument( - '--tokenizer_dir', - type=str, - default='pyTorchModel', - help='Directory containing the tokenizer model.', - ) - parser.add_argument('--temperature', type=float, default=1.0) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--top_p', type=float, default=0.0) - parser.add_argument('--random_seed', type=int, default=1) - return parser.parse_args() - - -def process_response(responseList): - for i, response in enumerate(responseList): - response = response.strip() - punkts = [ - [",", ","], - ["!", "!"], - [":", ":"], - [";", ";"], - ["\?", "?"], - ] - for item in punkts: - response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], - r"\1%s" % item[1], response) - response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], - r"%s\1" % item[1], response) - - responseList[i] = response - return responseList - - -if __name__ == '__main__': - args = parse_arguments() - tensorrt_llm.logger.set_level(args.log_level) - - config_path = os.path.join(args.engine_dir, 'config.json') - with open(config_path, 'r') as f: - config = json.load(f) - assert (config['builder_config']['name'] == MODEL_NAME) - dtype = config['builder_config']['precision'] - end_id = config['builder_config']['eos_token_id'] - pad_id = config['builder_config']['pad_token_id'] - max_batch_size = config['builder_config']['max_batch_size'] - use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] - world_size = config['builder_config']['tensor_parallel'] - assert world_size == tensorrt_llm.mpi_world_size( - ), f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - - runtime_rank = tensorrt_llm.mpi_rank() - runtime_mapping = tensorrt_llm.Mapping(world_size, - runtime_rank, - tp_size=world_size) - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - - serialize_path = find_engines(Path(args.engine_dir), - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] - - tokenizer = transformers.AutoTokenizer.from_pretrained( - args.tokenizer_dir, trust_remote_code=True) - input_ids = None - input_text = None - if args.input_tokens is None: - input_text = args.input_text[:max_batch_size] - tokenized = tokenizer(input_text, - return_tensors="pt", - padding=True, - return_length=True) - input_ids = tokenized['input_ids'].int().contiguous().cuda() - input_lengths = tokenized['length'].int().contiguous().cuda() - else: - input_ids = [] - with open(args.input_tokens) as f_in: - for line in f_in: - for e in line.strip().split(','): - 
input_ids.append(int(e)) - input_text = "" - input_ids = torch.tensor(input_ids, - dtype=torch.int32).cuda().unsqueeze(0) - - if use_gpt_attention_plugin: - # when using gpt attention plugin, inputs needs to align at the head - input_ids_padding_right = torch.zeros_like(input_ids) + end_id - for i, sample in enumerate(input_ids): - nPadding = 0 - for token in sample: - if token == pad_id: - nPadding += 1 - else: - break - input_ids_padding_right[ - i, :len(sample[nPadding:])] = sample[nPadding:] - input_ids = input_ids_padding_right - - model_config = ModelConfig( - vocab_size=config['builder_config']['vocab_size'], - num_layers=config['builder_config']['num_layers'], - num_heads=config['builder_config']['num_heads'] // world_size, - num_kv_heads=config['builder_config']['num_kv_heads'] // world_size, - hidden_size=config['builder_config']['hidden_size'] // world_size, - gpt_attention_plugin=use_gpt_attention_plugin, - remove_input_padding=config['builder_config']['remove_input_padding'], - model_name=MODEL_NAME, - paged_kv_cache=config['builder_config']['paged_kv_cache'], - quant_mode=QuantMode(config['builder_config']['quant_mode']), - dtype=dtype, - ) - - sampling_config = SamplingConfig( - end_id=end_id, - pad_id=pad_id, - num_beams=args.beam_width, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - ) - sampling_config.random_seed = args.random_seed - - with open(serialize_path, 'rb') as f: - engine_buffer = f.read() - decoder = GenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) - decoder.setup(input_ids.size(0), input_ids.size(1), args.max_output_len, - args.beam_width) - output_ids = decoder.decode(input_ids, input_lengths, sampling_config) - torch.cuda.synchronize() - - for i in range(len(output_ids.tolist())): - output_beams_list = [ - tokenizer.batch_decode(output_ids[batch_idx, :, - input_lengths[batch_idx]:], - skip_special_tokens=True) - for batch_idx in range(input_ids.size(0)) - ] - output_text = process_response(output_beams_list[i]) - end = torch.where(input_ids[i] == end_id)[0] - inputLength = int(end[0]) if len(end) > 0 else input_ids.shape[1] - print("\nInput %2d ---> len=%d\n%s" % (i, inputLength, input_text[i])) - print("\nOutput %2d --->" % i) - for j, simple_output in enumerate(output_text): - end = torch.where(output_ids[i, j, input_lengths[i]:] == end_id)[0] - outputLength = int(end[0]) if len(end) > 0 else args.max_output_len - print(" Beam %2d ---> len=%d\n%s" % - (j, outputLength, simple_output)) - - print("Finished!") diff --git a/examples/chatglm2-6b/summarize.py b/examples/chatglm2-6b/summarize.py deleted file mode 100644 index 1bc2f1870c..0000000000 --- a/examples/chatglm2-6b/summarize.py +++ /dev/null @@ -1,428 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
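One detail of the ChatGLM2/3 MLP weights handled both by the removed `examples/chatglm2-6b/weight.py` later in this patch and by the new unified loader is the SwiGLU reordering: the two halves of `mlp.dense_h_to_4h` are swapped before being assigned to the TensorRT-LLM `mlp.fc` weight. A minimal sketch of that reordering under tensor parallelism, mirroring the chunk indices used in the loaders, is shown below; which half is the gate and which is the up projection is left open here.

```python
import torch


def swap_swiglu_halves_sketch(h_to_4h_weight, tp_size, tp_rank):
    # HF stores the two SwiGLU projections stacked along the output dimension.
    # Split into 2 * tp_size row blocks, then concatenate this rank's block from
    # the second half followed by its block from the first half, as done by the
    # loaders in this patch.
    blocks = torch.chunk(h_to_4h_weight, 2 * tp_size, dim=0)
    return torch.cat([blocks[tp_rank + tp_size], blocks[tp_rank]], dim=0)
```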
-import argparse -import copy -import json -from pathlib import Path - -import evaluate -import numpy as np -import torch -from datasets import load_dataset -from transformers import AutoModel, AutoTokenizer - -import tensorrt_llm -import tensorrt_llm.profiler as profiler -from tensorrt_llm.logger import logger -from tensorrt_llm.runtime import GenerationSession, ModelConfig, SamplingConfig - -from build import find_engines # isort:skip - -MODEL_NAME = "chatglm2-6b" - - -def TRT(args, config): - - dtype = config['builder_config']['precision'] - world_size = config['builder_config']['tensor_parallel'] - assert world_size == tensorrt_llm.mpi_world_size(), \ - f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - - world_size = config['builder_config']['tensor_parallel'] - remove_input_padding = config['plugin_config']['remove_input_padding'] - - model_config = ModelConfig( - model_name=MODEL_NAME, - vocab_size=config['builder_config']['vocab_size'], - num_layers=config['builder_config']['num_layers'], - num_heads=config['builder_config']['num_heads'] // world_size, - num_kv_heads=config['builder_config']['num_heads'] // world_size, - hidden_size=config['builder_config']['hidden_size'] // world_size, - gpt_attention_plugin=bool( - config['plugin_config']['gpt_attention_plugin']), - remove_input_padding=remove_input_padding, - tokens_per_block=config['plugin_config']['tokens_per_block'], - paged_kv_cache=config['plugin_config']['paged_kv_cache'], - dtype=dtype, - use_custom_all_reduce=config['plugin_config']['use_custom_all_reduce'], - ) - - runtime_rank = tensorrt_llm.mpi_rank() - runtime_mapping = tensorrt_llm.Mapping(world_size, - runtime_rank, - tp_size=world_size) - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - - serialize_path = find_engines(args.engine_dir, - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] - - tensorrt_llm.logger.set_level(args.log_level) - - with open(serialize_path, 'rb') as f: - engine_buffer = f.read() - decoder = GenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) - - return decoder - - -def main(args): - runtime_rank = tensorrt_llm.mpi_rank() - logger.set_level(args.log_level) - - test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 - test_trt_llm = args.test_trt_llm - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, - padding_side='left', - trust_remote_code=True, - ) - - if args.eval_type == 'code_completion': - dataset_name = "openai_humaneval" - dataset_revision = None - dataset_input_key = 'prompt' - dataset_output_key = 'canonical_solution' - elif args.eval_type == 'summarize': - dataset_name = "ccdv/cnn_dailymail" - dataset_revision = "3.0.0" - dataset_input_key = 'article' - dataset_output_key = 'highlights' - args.dataset_path.mkdir(parents=True, exist_ok=True) - dataset = load_dataset(dataset_name, - dataset_revision, - cache_dir=args.dataset_path) - - config_path = str(args.engine_dir / 'config.json') - with open(config_path, 'r') as f: - config = json.load(f) - - max_batch_size = args.batch_size - - # runtime parameters - # repetition_penalty = 1 - top_k = args.top_k - output_len = args.output_len - test_token_num = 800 - # top_p = 0.0 - # random_seed = 5 - temperature = 1 - num_beams = args.num_beams - length_penalty = args.length_penalty - - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - if test_trt_llm: - 
tensorrt_llm_gpt = TRT(args, config) - - if test_hf: - model = AutoModel.from_pretrained( - args.hf_model_location, - trust_remote_code=True, - ) - model.cuda() - if args.data_type == 'fp16': - model.half() - - def eval_tensorrt_llm(datapoint, eval_type='summarize'): - batch_size = len(datapoint) - append_str = ' TL;DR: ' if eval_type == 'summarize' else '' - line = copy.copy(datapoint) - line_encoded = [] - input_lengths = [] - for i in range(batch_size): - line[i] = line[i] + append_str - - line[i] = line[i].strip() - line[i] = line[i].replace(" n't", "n't") - - input_id = tokenizer.encode( - line[i], - return_tensors='pt', - ).type(torch.int32) - input_id = input_id[:, :test_token_num] - - line_encoded.append(input_id) - input_lengths.append(input_id.shape[-1]) - - max_length = max(input_lengths) - - if tensorrt_llm_gpt.remove_input_padding: - line_encoded = [t.to(torch.int32).cuda() for t in line_encoded] - else: - # do padding, should move outside the profiling to prevent the overhead - for i in range(batch_size): - pad_size = max_length - input_lengths[i] - - pad = torch.ones([1, pad_size], dtype=torch.int32) * pad_id - line_encoded[i] = torch.cat( - [line_encoded[i].to(torch.int32), pad], axis=-1) - - line_encoded = torch.cat(line_encoded, axis=0).cuda() - input_lengths = torch.tensor(input_lengths, - dtype=torch.int32).cuda() - - sampling_config = SamplingConfig( - end_id=end_id, - pad_id=pad_id, - top_k=top_k, - num_beams=num_beams, - length_penalty=length_penalty, - ) - - with torch.no_grad(): - tensorrt_llm_gpt.setup(batch_size, - max_context_length=max_length, - max_new_tokens=output_len, - beam_width=num_beams) - - if tensorrt_llm_gpt.remove_input_padding: - output_ids = tensorrt_llm_gpt.decode_batch( - line_encoded, sampling_config) - else: - output_ids = tensorrt_llm_gpt.decode( - line_encoded, - input_lengths, - sampling_config, - ) - - torch.cuda.synchronize() - - # Extract a list of tensors of shape beam_width x output_ids. - if tensorrt_llm_gpt.mapping.is_first_pp_rank(): - output_beams_list = [ - tokenizer.batch_decode(output_ids[batch_idx, :, - input_lengths[batch_idx]:], - skip_special_tokens=True) - for batch_idx in range(batch_size) - ] - return output_beams_list, output_ids[:, :, max_length:].tolist() - return [], [] - - def eval_hf(datapoint, eval_type='summarize'): - batch_size = len(datapoint) - append_str = ' TL;DR: ' if eval_type == 'summarize' else '' - if batch_size > 1: - logger.warning( - f"HF does not support batch_size > 1 to verify correctness due to padding and attention mask. 
Current batch size is {batch_size}" - ) - - line = copy.copy(datapoint) - line_encoded = [] - input_lengths = [] - for i in range(batch_size): - line[i] = line[i] + append_str - - line[i] = line[i].strip() - line[i] = line[i].replace(" n't", "n't") - - input_id = tokenizer.encode( - line[i], - return_tensors='pt', - ).type(torch.int64) - input_id = input_id[:, :test_token_num] - - line_encoded.append(input_id) - input_lengths.append(input_id.shape[-1]) - - max_length = max(input_lengths) - - for i in range(batch_size): - pad_size = max_length - input_lengths[i] - - pad = torch.ones([1, pad_size], dtype=torch.int64) * pad_id - line_encoded[i] = torch.cat([pad, line_encoded[i].to(torch.int64)], - axis=-1) - - line_encoded = torch.cat(line_encoded, axis=0).cuda() - - with torch.no_grad(): - output = model.generate(line_encoded, - max_length=len(line_encoded[0]) + - output_len, - top_k=top_k, - temperature=temperature, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id, - num_beams=num_beams, - num_return_sequences=num_beams, - early_stopping=True, - length_penalty=length_penalty) - - tokens_list = output[:, len(line_encoded[0]):].tolist() - output = output.reshape([batch_size, num_beams, -1]) - output_lines_list = [ - tokenizer.batch_decode(output[:, i, len(line_encoded[0]):], - skip_special_tokens=True) - for i in range(num_beams) - ] - - return output_lines_list, tokens_list - - if test_trt_llm: - datapoint = dataset['test'][0:1] - output, _ = eval_tensorrt_llm(datapoint[dataset_input_key], - eval_type=args.eval_type) - if runtime_rank == 0: - logger.info( - "---------------------------------------------------------") - logger.info("TensorRT-LLM Generated : ") - logger.info(f" Input : {datapoint[dataset_input_key]}") - logger.info(f"\n Reference : {datapoint[dataset_output_key]}") - logger.info(f"\n Output : {output}") - logger.info( - "---------------------------------------------------------") - - if test_hf: - datapoint = dataset['test'][0:1] - output, _ = eval_hf(datapoint[dataset_input_key], - eval_type=args.eval_type) - logger.info("---------------------------------------------------------") - logger.info("HF Generated : ") - logger.info(f" Input : {datapoint[dataset_input_key]}") - logger.info(f"\n Reference : {datapoint[dataset_output_key]}") - logger.info(f"\n Output : {output}") - logger.info("---------------------------------------------------------") - - metric_tensorrt_llm = [evaluate.load("rouge") for _ in range(num_beams)] - metric_hf = [evaluate.load("rouge") for _ in range(num_beams)] - for i in range(num_beams): - metric_tensorrt_llm[i].seed = 0 - metric_hf[i].seed = 0 - - ite_count = 0 - data_point_idx = 0 - while (data_point_idx < len(dataset['test'])) and (ite_count < - args.max_ite): - if runtime_rank == 0: - logger.debug( - f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" - ) - datapoint = dataset['test'][data_point_idx:(data_point_idx + - max_batch_size)] - - if test_trt_llm: - profiler.start('tensorrt_llm') - output_tensorrt_llm, _ = eval_tensorrt_llm( - datapoint[dataset_input_key]) - profiler.stop('tensorrt_llm') - - if test_hf: - profiler.start('hf') - output_hf, _ = eval_hf(datapoint[dataset_input_key]) - profiler.stop('hf') - - if runtime_rank == 0: - if test_trt_llm: - for batch_idx in range(len(output_tensorrt_llm)): - for beam_idx in range(num_beams): - metric_tensorrt_llm[beam_idx].add_batch( - predictions=[ - output_tensorrt_llm[batch_idx][beam_idx] - ], - references=[ - 
datapoint[dataset_output_key][batch_idx] - ]) - if test_hf: - for beam_idx in range(num_beams): - for batch_idx in range(len(output_hf[beam_idx])): - metric_hf[beam_idx].add_batch( - predictions=[output_hf[beam_idx][batch_idx]], - references=[ - datapoint[dataset_output_key][batch_idx] - ]) - - logger.debug('-' * 100) - logger.debug(f"Input : {datapoint[dataset_input_key]}") - if test_trt_llm: - logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') - if test_hf: - logger.debug(f'HF Output: {output_hf}') - logger.debug(f"highlights : {datapoint[dataset_output_key]}") - - data_point_idx += max_batch_size - ite_count += 1 - - if runtime_rank == 0: - if test_trt_llm: - np.random.seed(0) # rouge score use sampling to compute the score - logger.info( - f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' - ) - for beam_idx in range(num_beams): - logger.info(f"TensorRT-LLM beam {beam_idx} result") - computed_metrics_tensorrt_llm = metric_tensorrt_llm[ - beam_idx].compute() - for key in computed_metrics_tensorrt_llm.keys(): - logger.info( - f' {key} : {computed_metrics_tensorrt_llm[key] * 100}') - - if args.check_accuracy and beam_idx == 0: - assert computed_metrics_tensorrt_llm[ - 'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold - if test_hf: - np.random.seed(0) # rouge score use sampling to compute the score - logger.info( - f'Hugging Face (total latency: {profiler.elapsed_time_in_sec("hf")} sec)' - ) - for beam_idx in range(num_beams): - logger.info(f"HF beam {beam_idx} result") - computed_metrics_hf = metric_hf[beam_idx].compute() - for key in computed_metrics_hf.keys(): - logger.info(f' {key} : {computed_metrics_hf[key] * 100}') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--hf_model_location', type=str, default='pyTorchModel') - parser.add_argument( - '--tokenizer', - default=None, - help='tokenizer path; defaults to hf_model_location if left unspecified' - ) - parser.add_argument('--test_hf', action='store_true', default=True) - parser.add_argument('--test_trt_llm', action='store_true', default=True) - parser.add_argument('--data_type', - type=str, - choices=['fp32', 'fp16'], - default='fp16') - parser.add_argument('--dataset_path', type=Path, default='dataset') - parser.add_argument('--log_level', type=str, default='info') - parser.add_argument('--engine_dir', type=Path, default='trtModel') - parser.add_argument('--batch_size', type=int, default=1) - parser.add_argument('--max_ite', type=int, default=20) - parser.add_argument('--output_len', type=int, default=100) - parser.add_argument('--check_accuracy', action='store_true', default=True) - parser.add_argument('--tensorrt_llm_rouge1_threshold', - type=float, - default=15.0) - parser.add_argument('--num_beams', type=int, default=1) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--eval_type', - type=str, - default='summarize', - choices=['summarize', 'code_completion']) - parser.add_argument('--length_penalty', type=float, default=1.0) - - args = parser.parse_args() - if args.tokenizer == None: - args.tokenizer = args.hf_model_location - main(args) diff --git a/examples/chatglm2-6b/weight.py b/examples/chatglm2-6b/weight.py deleted file mode 100644 index 6aa8996790..0000000000 --- a/examples/chatglm2-6b/weight.py +++ /dev/null @@ -1,131 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import time - -import numpy as np -import torch - -import tensorrt_llm -from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy -from tensorrt_llm.quantization import QuantMode - - -def load_from_hf( - tensorrt_llm_model, - hf_model, - mapping=None, - dtype="float32", - multi_query_mode=False, -): - tensorrt_llm.logger.info("Loading weights from HF ChatGLM2-6B") - tik = time.time() - - quant_mode = getattr(tensorrt_llm_model, 'quant_mode', QuantMode(0)) - if quant_mode.is_int8_weight_only(): - plugin_weight_only_quant_type = torch.int8 - elif quant_mode.is_int4_weight_only(): - plugin_weight_only_quant_type = torch.quint4x2 - use_weight_only = quant_mode.is_weight_only() - - torch_type = str_dtype_to_torch(dtype) - tensorrt_llm_model.embedding.weight.value = torch_to_numpy( - hf_model.transformer.embedding.word_embeddings.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.encoder.final_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.encoder.final_layernorm.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.lm_head.weight.value = torch_to_numpy( - hf_model.transformer.output_layer.weight.to(torch_type).detach().cpu()) - - def load_quant_weight(src, value_dst, scale_dst, - plugin_weight_only_quant_type): - v = np.ascontiguousarray(src.transpose()) - processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( - torch.tensor(v), plugin_weight_only_quant_type) - value_dst.value = torch_to_numpy(processed_torch_weights) - scale_dst.value = torch_to_numpy(torch_weight_scales) - - for i in range(28): - tensorrt_llm_model.encoder.layers[ - i].input_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].input_layernorm.weight. - to(torch_type).detach().cpu()) - tensorrt_llm_model.encoder.layers[ - i].post_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].post_attention_layernorm. - weight.to(torch_type).detach().cpu()) - tensorrt_llm_model.encoder.layers[ - i].self_attention.qkv.bias.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].self_attention. - query_key_value.bias.to(torch_type).detach().cpu()) - # swap first and second half weight columns to adapt trt_llm Swiglu - h_to_4h_weight = hf_model.transformer.encoder.layers[ - i].mlp.dense_h_to_4h.weight.to(torch_type).detach().cpu() - h_to_4h_weight = torch.split(h_to_4h_weight, - h_to_4h_weight.shape[0] // 2, 0) - h_to_4h_weight = torch_to_numpy(torch.concat(h_to_4h_weight[::-1], 0)) - if use_weight_only: - load_quant_weight( - src=h_to_4h_weight, - value_dst=tensorrt_llm_model.encoder.layers[i].mlp.fc.weight, - scale_dst=tensorrt_llm_model.encoder.layers[i].mlp.fc. - per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.encoder.layers[i].mlp.dense_4h_to_h. 
- weight.to(torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.encoder.layers[i].mlp.proj.weight, - scale_dst=tensorrt_llm_model.encoder.layers[i].mlp.proj. - per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.encoder.layers[i].self_attention. - query_key_value.weight.to(torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.encoder.layers[i].self_attention. - qkv.weight, - scale_dst=tensorrt_llm_model.encoder.layers[i].self_attention. - qkv.per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.encoder.layers[i].self_attention.dense. - weight.to(torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.encoder.layers[i].self_attention. - dense.weight, - scale_dst=tensorrt_llm_model.encoder.layers[i].self_attention. - dense.per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - - else: - tensorrt_llm_model.encoder.layers[ - i].self_attention.qkv.weight.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].self_attention. - query_key_value.weight.to(torch_type).detach().cpu()) - tensorrt_llm_model.encoder.layers[ - i].self_attention.dense.weight.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].self_attention.dense. - weight.to(torch_type).detach().cpu()) - tensorrt_llm_model.encoder.layers[ - i].mlp.fc.weight.value = h_to_4h_weight - tensorrt_llm_model.encoder.layers[ - i].mlp.proj.weight.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].mlp.dense_4h_to_h. - weight.to(torch_type).detach().cpu()) - - tok = time.time() - tensorrt_llm.logger.info("Loading weights finish in %.2fs" % (tok - tik)) - return tensorrt_llm_model diff --git a/examples/chatglm6b/.gitignore b/examples/chatglm6b/.gitignore deleted file mode 100644 index baa5534912..0000000000 --- a/examples/chatglm6b/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -__pycache__/ -pyTorchModel/ -trtModel/ -dataset/ -.vscode/ diff --git a/examples/chatglm6b/README.md b/examples/chatglm6b/README.md deleted file mode 100644 index acdecc16ca..0000000000 --- a/examples/chatglm6b/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# ChatGLM-6B - -This document explains how to build the [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) model using TensorRT-LLM and run on a single GPU - -## Overview - -The TensorRT-LLM ChatGLM-6B implementation can be found in [`tensorrt_llm/models/chatglm6b/model.py`](../../tensorrt_llm/models/chatglm6b/model.py). -The TensorRT-LLM ChatGLM-6B example code is located in [`examples/chatglm6b`](./). There are 3 main files in that folder: - -* [`build.py`](./build.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the ChatGLM-6B model. -* [`run.py`](./run.py) to run the inference on an input text. -* [`summarize.py`](./summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset using the model. - -## Usage - -### 1. Prepare environment and download weights from HuggingFace Transformers - -```bash -pip install -r requirements.txt -apt-get update -apt-get install git-lfs -git clone https://huggingface.co/THUDM/chatglm-6b pyTorchModel -``` - -### 2. Build TensorRT engine(s) - -+ This ChatGLM-6B example in TensorRT-LLM builds TensorRT engine(s) using HF checkpoint directly (rather than using FT checkpoints such as GPT example). 
-+ If no checkpoint directory is specified, TensorRT-LLM will build engine(s) using dummy weights.
-+ The [`build.py`](./build.py) script requires a single GPU to build the TensorRT engine(s).
-+ You can enable parallel builds to accelerate the engine building process if you have more than one GPU (of the same model) in your system.
-+ For parallel building, add the `--parallel_build` argument to the build command (this feature cannot take advantage of more than a single node).
-+ The number of TensorRT engines depends on the number of GPUs that will be used to run inference.
-
-#### Examples of build invocations:
-
-```bash
-# Build a single-GPU float16 engine using HF weights.
-# --use_gpt_attention_plugin must be used to deal with inputs of different lengths in one batch.
-# --use_gemm_plugin, --use_layernorm_plugin, --enable_context_fmha, --enable_context_fmha_fp32_acc are used to improve accuracy or performance.
-python3 build.py --dtype float16 \
-                 --use_gpt_attention_plugin float16 \
-                 --use_gemm_plugin float16
-```
-
-#### Fused MultiHead Attention (FMHA)
-
-+ Use `--enable_context_fmha` or `--enable_context_fmha_fp32_acc` to enable FMHA kernels, which can provide better performance and lower GPU memory occupancy.
-
-+ The switch `--use_gpt_attention_plugin float16` must be used when FMHA is enabled.
-
-+ `--enable_context_fmha` uses an FP16 accumulator, which might reduce accuracy. In that case, `--enable_context_fmha_fp32_acc` should be used to protect accuracy at the cost of a small performance drop.
-
-#### In-flight batching and paged KV cache
-
-+ The engine must be built accordingly if [in-flight batching in the C++ runtime](../../docs/in_flight_batching.md) will be used.
-
-+ Use `--use_inflight_batching` to enable In-flight Batching.
-
-+ The switches `--use_gpt_attention_plugin=float16`, `--paged_kv_cache` and `--remove_input_padding` are set automatically when In-flight Batching is enabled.
-
-+ It is possible to use `--use_gpt_attention_plugin float32` with In-flight Batching.
-
-+ The block size of the paged KV cache can additionally be controlled with `--tokens_per_block=N`.
-
-### 3. Run
-
-#### Single node, single GPU
-
-Run the TensorRT-LLM ChatGLM-6B model on a single GPU:
-
-```bash
-# Run the ChatGLM-6B model on a single GPU.
-python3 run.py
-```
-
-Run a comparison of performance and accuracy:
-
-```bash
-# Run the summarization task.
-python3 summarize.py
-```
-
-## Benchmark
-
-+ [TODO] The TensorRT-LLM ChatGLM-6B benchmark is located in [benchmarks/](../../benchmarks/README.md)
diff --git a/examples/chatglm6b/build.py b/examples/chatglm6b/build.py
deleted file mode 100644
index 68301f0131..0000000000
--- a/examples/chatglm6b/build.py
+++ /dev/null
@@ -1,518 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
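Because `parse_arguments` in the removed build script below accepts an explicit argument list, the ChatGLM-6B build could also be driven programmatically rather than from the command line. The following usage sketch assumes it is run from `examples/chatglm6b` with the HF checkpoint cloned into `pyTorchModel` as described in the README above; the flag values are simply the combination documented there.

```python
# Hypothetical programmatic invocation of the (removed) examples/chatglm6b/build.py,
# assuming the ChatGLM-6B HF checkpoint is available in ./pyTorchModel.
from build import run_build

run_build([
    "--dtype", "float16",
    "--use_gpt_attention_plugin", "float16",
    "--use_gemm_plugin", "float16",
    # Optionally append "--use_weight_only" for INT8 weight-only quantization.
])
```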
-import argparse -import json -import time -from pathlib import Path -from typing import List - -import torch -import torch.multiprocessing as mp -import transformers -from weight import load_from_hf - -import tensorrt_llm -from tensorrt_llm.builder import Builder -from tensorrt_llm.logger import logger -from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (ChatGLM6BHeadModel, smooth_quantize, - weight_only_quantize) -from tensorrt_llm.network import net_guard -from tensorrt_llm.plugin.plugin import ContextFMHAType -from tensorrt_llm.quantization import QuantMode - -MODEL_NAME = "chatglm-6b" - - -def get_engine_name(model, dtype, tp_size, rank): - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - - -def find_engines(dir: Path, - model_name: str = "*", - dtype: str = "*", - tp_size: str = "*", - rank: str = "*") -> List[Path]: - template = f"{model_name}_{dtype}_tp{tp_size}_rank{rank}.engine" - return list(dir.glob(template)) - - -def serialize_engine(engine, path): - logger.info(f'Serializing engine to {path}...') - tik = time.time() - with open(path, 'wb') as f: - f.write(bytearray(engine)) - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Engine serialized. Total time: {t}') - - -def parse_arguments(args): - parser = argparse.ArgumentParser() - parser.add_argument('--world_size', - type=int, - default=1, - help='world size, only support tensor parallelism now') - parser.add_argument('--model_dir', type=str, default="pyTorchModel") - parser.add_argument('--dtype', - type=str, - default='float16', - choices=['float32', 'float16', 'bfloat16']) - parser.add_argument( - '--timing_cache', - type=str, - default='model.cache', - help= - 'The path of to read timing cache from, will be ignored if the file does not exist' - ) - parser.add_argument( - '--log_level', - type=str, - default='verbose', - choices=['verbose', 'info', 'warning', 'error', 'internal_error']) - parser.add_argument('--max_batch_size', type=int, default=8) - parser.add_argument('--max_input_len', type=int, default=1024) - parser.add_argument('--max_output_len', type=int, default=1024) - parser.add_argument('--max_beam_width', type=int, default=1) - parser.add_argument( - '--use_gpt_attention_plugin', - nargs='?', - const='float16', - default='float16', - choices=['float32', 'float16', 'bfloat16', False], - help= - "Activates attention plugin. You can specify the plugin dtype or leave blank to use the model dtype." - ) - parser.add_argument( - '--use_gemm_plugin', - nargs='?', - const='float16', - type=str, - default='float16', - choices=['float32', 'float16', 'bfloat16', False], - help= - "Activates GEMM plugin. You can specify the plugin dtype or leave blank to use the model dtype." - ) - parser.add_argument( - '--use_layernorm_plugin', - nargs='?', - const='float16', - type=str, - default='float16', - choices=['float32', 'float16', 'bfloat16', False], - help= - "Activates layernorm plugin. 
You can specify the plugin dtype or leave blank to use the model dtype.", - ) - parser.add_argument('--gather_all_token_logits', - action='store_true', - default=False) - parser.add_argument('--parallel_build', default=False, action='store_true') - parser.add_argument('--enable_context_fmha', - default=False, - action='store_true') - parser.add_argument('--enable_context_fmha_fp32_acc', - default=False, - action='store_true') - parser.add_argument('--gpus_per_node', type=int, default=8) - parser.add_argument('--builder_opt', type=int, default=None) - parser.add_argument( - '--output_dir', - type=Path, - default='trtModel', - help= - 'The path to save the serialized engine files, timing cache file and model configs' - ) - parser.add_argument('--remove_input_padding', - default=False, - action='store_true') - parser.add_argument( - '--use_inflight_batching', - action="store_true", - default=False, - help="Activates inflight batching mode of gptAttentionPlugin.") - - # Arguments related to the quantization of the model. - parser.add_argument( - '--use_smooth_quant', - default=False, - action="store_true", - help= - 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' - 'See --per_channel and --per_token for finer-grained quantization options.' - ) - parser.add_argument( - '--use_weight_only', - default=False, - action="store_true", - help='Quantize weights for the various GEMMs to INT4/INT8.' - 'See --weight_only_precision to set the precision') - parser.add_argument( - '--weight_only_precision', - const='int8', - type=str, - nargs='?', - default='int8', - choices=['int8', 'int4'], - help= - 'Define the precision for the weights when using weight-only quantization.' - 'You must also use --use_weight_only for that argument to have an impact.' - ) - parser.add_argument( - '--per_channel', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor for the GEMM\'s result. ' - 'per_channel instead uses a different static scaling factor for each channel. ' - 'The latter is usually more accurate, but a little slower.') - parser.add_argument( - '--per_token', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor to scale activations in the int8 range. ' - 'per_token chooses at run time, and for each token, a custom scaling factor. ' - 'The latter is usually more accurate, but a little slower.') - parser.add_argument( - '--int8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' - ) - parser.add_argument( - '--random_seed', - type=int, - default=None, - help= - 'Seed to use when initializing the random number generator for torch.') - parser.add_argument( - '--paged_kv_cache', - action="store_true", - default=False, - help= - 'By default we use contiguous KV cache. By setting this flag you enable paged KV cache' - ) - parser.add_argument('--tokens_per_block', - type=int, - default=64, - help='Number of tokens per block in paged KV cache') - - parser.add_argument( - '--enable_fp8', - default=False, - action='store_true', - ) - parser.add_argument( - '--fp8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. 
fp8_kv_cache chooses fp8 quantization for KV' - ) - parser.add_argument( - '--max_num_tokens', - type=int, - default=None, - help='Define the max number of tokens supported by the engine') - parser.add_argument( - '--strongly_typed', - default=False, - action="store_true", - help= - 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' - ) - parser.add_argument( - '--use_custom_all_reduce', - action='store_true', - help= - 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') - args = parser.parse_args(args) - logger.set_level(args.log_level) - - args.apply_query_key_layer_scaling = False # always False in TRT-LLM - args.bias = True - args.hidden_act = 'gelu' - args.multi_block_mode = False - args.multi_query_mode = False # always False in ChatGLM-6B - - plugins_args = [ - 'use_gpt_attention_plugin', - 'use_gemm_plugin', - 'use_layernorm_plugin', - ] - - for plugin_arg in plugins_args: - if getattr(args, plugin_arg) is None: - logger.info( - f"{plugin_arg} set, without specifying a value. Using {args.dtype} automatically." - ) - setattr(args, plugin_arg, args.dtype) - - assert args.model_dir is not None - with open(Path(args.model_dir) / "config.json", "r") as f: - js = json.loads(f.read()) - assert js["_name_or_path"] == "THUDM/" + MODEL_NAME - assert args.max_input_len < js["max_sequence_length"] - - args.eos_token_id = js["eos_token_id"] - args.ffn_hidden_size = js["inner_hidden_size"] - args.gmask_token_id = js["gmask_token_id"] - args.hidden_size = js["hidden_size"] - args.layernorm_epsilon = js["layernorm_epsilon"] - args.mask_token_id = js["mask_token_id"] - args.max_seq_length = min(args.max_input_len + args.max_output_len, - js["max_sequence_length"]) - args.num_heads = js["num_attention_heads"] - args.num_kv_heads = js["num_attention_heads"] - args.num_layers = js["num_layers"] - args.pad_token_id = js["pad_token_id"] - args.use_cache = js["use_cache"] - args.vocab_size = js["vocab_size"] - - if args.use_inflight_batching: - if not args.use_gpt_attention_plugin: - args.use_gpt_attention_plugin = 'float16' - logger.info( - f"Using GPT attention plugin for inflight batching mode. Setting to default '{args.use_gpt_attention_plugin}'" - ) - if not args.remove_input_padding: - args.remove_input_padding = True - logger.info( - "Using remove input padding for inflight batching mode.") - if not args.paged_kv_cache: - args.paged_kv_cache = True - logger.info("Using paged KV cache for inflight batching mode.") - - assert not ( - args.use_smooth_quant and args.use_weight_only - ), "You cannot enable both SmoothQuant and INT8 weight-only together." 
- - if args.use_smooth_quant: - args.quant_mode = QuantMode.use_smooth_quant(args.per_token, - args.per_channel) - elif args.use_weight_only: - args.quant_mode = QuantMode.use_weight_only( - args.weight_only_precision == 'int4') - else: - args.quant_mode = QuantMode(0) - - if args.int8_kv_cache: - args.quant_mode = args.quant_mode.set_int8_kv_cache() - - if args.fp8_kv_cache: - assert ( - args.use_gpt_attention_plugin or args.use_inflight_batching - ), "You have to use GPT attention plugin when fp8 KV cache is set" - args.quant_mode = args.quant_mode.set_fp8_kv_cache() - - if args.enable_fp8: - args.quant_mode = args.quant_mode.set_fp8_qdq() - - if args.max_num_tokens is not None: - assert args.enable_context_fmha - - return args - - -def build_rank_engine(builder: Builder, - builder_config: tensorrt_llm.builder.BuilderConfig, - engine_name, rank, args): - ''' - @brief: Build the engine on the given rank. - @param rank: The rank to build the engine. - @param args: The cmd line arguments. - @return: The built engine. - ''' - - # Initialize Module - args.mapping = Mapping( - world_size=args.world_size, - rank=rank, - tp_size=args.world_size, - ) - trtllm_model = ChatGLM6BHeadModel(args=args) - - if args.use_smooth_quant: - trtllm_model = smooth_quantize(trtllm_model, args.quant_mode) - elif args.use_weight_only: - trtllm_model = weight_only_quantize(trtllm_model, args.quant_mode) - - if args.model_dir is not None: - hf_model = transformers.AutoModel.from_pretrained( - args.model_dir, trust_remote_code=True).cpu() - trtllm_model = load_from_hf( - trtllm_model, - hf_model, - mapping=args.mapping, - dtype=args.dtype, - max_seq_length=args.max_seq_length, - ) - del hf_model - - # Module -> Network - network = builder.create_network() - network.trt_network.name = engine_name - if args.use_gpt_attention_plugin: - network.plugin_config.set_gpt_attention_plugin( - dtype=args.use_gpt_attention_plugin) - if args.use_gemm_plugin: - network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) - if args.use_layernorm_plugin: - network.plugin_config.set_layernorm_plugin( - dtype=args.use_layernorm_plugin) - assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) - if args.enable_context_fmha: - network.plugin_config.set_context_fmha(ContextFMHAType.enabled) - if args.enable_context_fmha_fp32_acc: - network.plugin_config.set_context_fmha( - ContextFMHAType.enabled_with_fp32_acc) - if args.remove_input_padding: - network.plugin_config.enable_remove_input_padding() - if args.paged_kv_cache: - network.plugin_config.enable_paged_kv_cache(args.tokens_per_block) - - # Quantization plugins. 
- if args.use_smooth_quant: - network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) - network.plugin_config.set_layernorm_quantization_plugin( - dtype=args.dtype) - - network.plugin_config.set_quantize_tensor_plugin() - network.plugin_config.set_quantize_per_token_plugin() - elif args.use_weight_only: - network.plugin_config.set_weight_only_quant_matmul_plugin( - dtype=args.dtype) - - if args.world_size > 1: - network.plugin_config.set_nccl_plugin(args.dtype, - args.use_custom_all_reduce) - - with net_guard(network): - # Prepare - network.set_named_parameters(trtllm_model.named_parameters()) - - # Forward - inputs = trtllm_model.prepare_inputs( - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_new_tokens=args.max_output_len, - use_cache=True, - max_beam_width=args.max_beam_width, - ) - trtllm_model(*inputs) - - tensorrt_llm.graph_rewriting.optimize(network) - - engine = None - - # Network -> Engine - engine = builder.build_engine(network, builder_config) - if rank == 0: - config_path = args.output_dir / 'config.json' - builder.save_config(builder_config, config_path) - return engine - - -def build(rank, args): - torch.cuda.set_device(rank % args.gpus_per_node) - tensorrt_llm.logger.set_level(args.log_level) - args.output_dir.mkdir(parents=True, exist_ok=True) - timing_cache_file = args.output_dir / "model.cache" - timing_cache = timing_cache_file - - builder = Builder() - - for cur_rank in range(args.world_size): - # skip other ranks if parallel_build is enabled - if args.parallel_build and cur_rank != rank: - continue - builder_config = builder.create_builder_config( - precision=args.dtype, - timing_cache=timing_cache, - tensor_parallel=args.world_size, - int8=(args.quant_mode.has_act_or_weight_quant() - or args.quant_mode.has_int8_kv_cache()), - fp8=args.enable_fp8, - strongly_typed=args.strongly_typed, - opt_level=args.builder_opt, - apply_query_key_layer_scaling=args.apply_query_key_layer_scaling, - eos_token_id=args.eos_token_id, - gather_all_token_logits=args.gather_all_token_logits, - hidden_act=args.hidden_act, - hidden_size=args.hidden_size, - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_num_tokens=args.max_output_len + args.max_input_len, - max_output_len=args.max_output_len, - max_position_embeddings=args.max_seq_length, - multi_query_mode=args.multi_query_mode, - name=MODEL_NAME, - num_heads=args.num_heads, - num_kv_heads=args.num_heads, - num_layers=args.num_layers, - pad_token_id=args.pad_token_id, - paged_kv_cache=args.paged_kv_cache, - parallel_build=args.parallel_build, - quant_mode=int(args.quant_mode), - remove_input_padding=args.remove_input_padding, - vocab_size=args.vocab_size, - ) - - engine_name = get_engine_name(MODEL_NAME, args.dtype, args.world_size, - cur_rank) - engine = build_rank_engine(builder, builder_config, engine_name, - cur_rank, args) - assert engine is not None, f'Failed to build engine for rank {cur_rank}' - - if cur_rank == 0: - # Use in-memory timing cache for multiple builder passes. - if not args.parallel_build: - timing_cache = builder_config.trt_builder_config.get_timing_cache( - ) - - serialize_engine(engine, args.output_dir / engine_name) - - if rank == 0: - ok = builder.save_timing_cache(builder_config, timing_cache_file) - assert ok, "Failed to save timing cache." 
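As a small worked example of the engine naming implemented by `get_engine_name` and `find_engines` in this removed build script (illustrative values only, not part of the patch):

```python
from pathlib import Path

# With MODEL_NAME = "chatglm-6b", dtype = "float16" and world_size = 2, the build
# loop above serializes one engine per rank into the output directory, e.g.
#   chatglm-6b_float16_tp2_rank0.engine
#   chatglm-6b_float16_tp2_rank1.engine
engine_names = [get_engine_name("chatglm-6b", "float16", 2, rank) for rank in range(2)]

# run.py and summarize.py later locate them with the matching glob template.
engine_paths = find_engines(Path("trtModel"),
                            model_name="chatglm-6b",
                            dtype="float16",
                            tp_size=2)
```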
- - -def run_build(args=None): - args = parse_arguments(args) - - if args.random_seed is not None: - torch.manual_seed(args.random_seed) - - logger.set_level(args.log_level) - tik = time.time() - if args.parallel_build and args.world_size > 1 and \ - torch.cuda.device_count() >= args.world_size: - logger.warning( - f'Parallelly build TensorRT engines. Please make sure that all of the {args.world_size} GPUs are totally free.' - ) - mp.spawn(build, nprocs=args.world_size, args=(args, )) - else: - args.parallel_build = False - logger.info('Serially build TensorRT engines.') - build(0, args) - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Total time of building all {args.world_size} engines: {t}') - - -if __name__ == '__main__': - run_build() diff --git a/examples/chatglm6b/requirements.txt b/examples/chatglm6b/requirements.txt deleted file mode 100644 index 140929584e..0000000000 --- a/examples/chatglm6b/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -datasets~=2.14.5 -evaluate -protobuf -rouge_score~=0.1.2 -sentencepiece diff --git a/examples/chatglm6b/weight.py b/examples/chatglm6b/weight.py deleted file mode 100644 index a114be676e..0000000000 --- a/examples/chatglm6b/weight.py +++ /dev/null @@ -1,133 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
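All of the weight-only code paths in this patch, including the removed loader below, funnel INT8/INT4 conversion through `torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix`. The sketch below is only a conceptual stand-in for the per-channel symmetric INT8 case, to make the value/scale split readable; the real op appears to also return the weights preprocessed into the layout the weight-only GEMM plugin expects, which is not reproduced here.

```python
import torch


def symmetric_int8_per_channel_sketch(w: torch.Tensor):
    # Conceptual sketch for a 2D tensor of shape [rows, out_channels]: one scale
    # per output channel (the last axis), scale = max(|w|) / 127, with values
    # rounded and clamped to the symmetric int8 range.
    scales = w.abs().amax(dim=0).clamp_min(1e-8) / 127.0
    q = torch.clamp(torch.round(w / scales), -127, 127).to(torch.int8)
    return q, scales
```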
-import time - -import numpy as np -import torch - -import tensorrt_llm -from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy -from tensorrt_llm.quantization import QuantMode - - -def load_from_hf( - tensorrt_llm_model, - hf_model, - mapping=None, - dtype="float32", - max_seq_length=2048, - multi_query_mode=False, -): - tensorrt_llm.logger.info("Loading weights from HF ChatGLM-6B") - tik = time.time() - - quant_mode = getattr(tensorrt_llm_model, 'quant_mode', QuantMode(0)) - if quant_mode.is_int8_weight_only(): - plugin_weight_only_quant_type = torch.int8 - elif quant_mode.is_int4_weight_only(): - plugin_weight_only_quant_type = torch.quint4x2 - use_weight_only = quant_mode.is_weight_only() - - torch_type = str_dtype_to_torch(dtype) - tensorrt_llm_model.embedding.weight.value = torch_to_numpy( - hf_model.transformer.word_embeddings.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.final_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.final_layernorm.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.final_layernorm.bias.value = torch_to_numpy( - hf_model.transformer.final_layernorm.bias.to(torch_type).detach().cpu()) - tensorrt_llm_model.lm_head.weight.value = torch_to_numpy( - hf_model.lm_head.weight.to(torch_type).detach().cpu()) - - def load_quant_weight(src, value_dst, scale_dst, - plugin_weight_only_quant_type): - v = np.ascontiguousarray(src.transpose()) - processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( - torch.tensor(v), plugin_weight_only_quant_type) - value_dst.value = torch_to_numpy(processed_torch_weights) - scale_dst.value = torch_to_numpy(torch_weight_scales) - - for i in range(28): - tensorrt_llm_model.layers[ - i].input_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].input_layernorm.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.layers[ - i].input_layernorm.bias.value = torch_to_numpy( - hf_model.transformer.layers[i].input_layernorm.bias.to( - torch_type).detach().cpu()) - tensorrt_llm_model.layers[ - i].post_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].post_attention_layernorm.weight. - to(torch_type).detach().cpu()) - tensorrt_llm_model.layers[i].post_layernorm.bias.value = torch_to_numpy( - hf_model.transformer.layers[i].post_attention_layernorm.bias.to( - torch_type).detach().cpu()) - tensorrt_llm_model.layers[i].attention.qkv.bias.value = torch_to_numpy( - hf_model.transformer.layers[i].attention.query_key_value.bias.to( - torch_type).detach().cpu()) - if use_weight_only: - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.layers[i].mlp.dense_h_to_4h.weight.to( - torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.layers[i].mlp.fc.weight, - scale_dst=tensorrt_llm_model.layers[i].mlp.fc.per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.layers[i].mlp.dense_4h_to_h.weight.to( - torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.layers[i].mlp.proj.weight, - scale_dst=tensorrt_llm_model.layers[i].mlp.proj. - per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.layers[i].attention.query_key_value. - weight.to(torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.layers[i].attention.qkv.weight, - scale_dst=tensorrt_llm_model.layers[i].attention.qkv. 
- per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.layers[i].attention.dense.weight.to( - torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.layers[i].attention.dense.weight, - scale_dst=tensorrt_llm_model.layers[i].attention.dense. - per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - - else: - tensorrt_llm_model.layers[ - i].attention.qkv.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].attention.query_key_value. - weight.to(torch_type).detach().cpu()) - tensorrt_llm_model.layers[ - i].attention.dense.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].attention.dense.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.layers[i].mlp.fc.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].mlp.dense_h_to_4h.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.layers[i].mlp.proj.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].mlp.dense_4h_to_h.weight.to( - torch_type).detach().cpu()) - - tok = time.time() - tensorrt_llm.logger.info("Loading weights finish in %.2fs" % (tok - tik)) - return tensorrt_llm_model diff --git a/examples/enc_dec/build.py b/examples/enc_dec/build.py index 9668e3c252..ce8a537a51 100644 --- a/examples/enc_dec/build.py +++ b/examples/enc_dec/build.py @@ -286,6 +286,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = args.output_dir / args.component / 'config.json' builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tllm_model) + return engine diff --git a/examples/falcon/build.py b/examples/falcon/build.py index 689711639a..fd3f4159f0 100644 --- a/examples/falcon/build.py +++ b/examples/falcon/build.py @@ -30,7 +30,7 @@ from tensorrt_llm._utils import str_dtype_to_trt from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import fp8_quantize +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -412,9 +412,9 @@ def build_rank_engine(builder: Builder, quant_scales = get_scaling_factors(args.quantized_fp8_model_path, num_layers=args.n_layer, quant_mode=args.quant_mode) - tensorrt_llm_falcon = fp8_quantize(tensorrt_llm_falcon, - quant_mode=args.quant_mode, - quant_scales=quant_scales) + tensorrt_llm_falcon = quantize_model(tensorrt_llm_falcon, + quant_mode=args.quant_mode, + quant_scales=quant_scales) if args.model_dir is not None: logger.info(f'Loading HF Falcon ... 
from {args.model_dir}') tik = time.time() @@ -497,6 +497,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_falcon) + return engine @@ -532,7 +535,6 @@ def build(rank, args): max_input_len=args.max_input_len, max_output_len=args.max_output_len, max_num_tokens=args.max_num_tokens, - fp8=args.quant_mode.has_fp8_qdq(), quant_mode=args.quant_mode, strongly_typed=args.strongly_typed, opt_level=args.builder_opt) @@ -549,6 +551,7 @@ def build(rank, args): cache = builder_config.trt_builder_config.get_timing_cache() serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine if rank == 0: ok = builder.save_timing_cache( diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index 7f49ed9a00..edaad3be36 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,5 +1,5 @@ transformers>=4.31.0 -datasets~=2.3.2 +datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 typing-extensions==4.5.0 diff --git a/examples/gpt/build.py b/examples/gpt/build.py index 37f3009524..36c2e24c97 100644 --- a/examples/gpt/build.py +++ b/examples/gpt/build.py @@ -26,7 +26,7 @@ from tensorrt_llm.builder import Builder from tensorrt_llm.layers import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import smooth_quantize, weight_only_quantize +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -433,11 +433,9 @@ def build_rank_engine(builder: Builder, use_parallel_embedding=args.use_parallel_embedding, embedding_sharding_dim=args.embedding_sharding_dim, share_embedding_table=share_embedding_table) - if args.use_smooth_quant: - tensorrt_llm_gpt = smooth_quantize(tensorrt_llm_gpt, args.quant_mode) - elif args.use_weight_only: - tensorrt_llm_gpt = weight_only_quantize(tensorrt_llm_gpt, - args.quant_mode) + + if args.use_smooth_quant or args.use_weight_only: + tensorrt_llm_gpt = quantize_model(tensorrt_llm_gpt, args.quant_mode) if args.model_dir is not None: gpt_dummy_fp8_scaling_factors = { @@ -534,6 +532,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = args.output_dir / 'config.json' builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_gpt) + return engine @@ -576,9 +577,9 @@ def build(rank, args): opt_level=args.builder_opt, multi_query_mode=args.multi_query_mode, strongly_typed=args.strongly_typed, - use_prompt_tuning=args.max_prompt_embedding_table_size > 0, + max_prompt_embedding_table_size=args. 
+ max_prompt_embedding_table_size, gather_all_token_logits=args.gather_all_token_logits, - fp8=args.enable_fp8, quant_mode=args.quant_mode, use_parallel_embedding=args.use_parallel_embedding) @@ -595,6 +596,7 @@ def build(rank, args): ) serialize_engine(engine, args.output_dir / engine_name) + del engine if rank == 0: ok = builder.save_timing_cache(builder_config, timing_cache_file) diff --git a/examples/gpt/nemo_prompt_convert.py b/examples/gpt/nemo_prompt_convert.py index ed18fe4ff6..9b16297470 100755 --- a/examples/gpt/nemo_prompt_convert.py +++ b/examples/gpt/nemo_prompt_convert.py @@ -32,22 +32,30 @@ logging.basicConfig(format=log_format) LOGGER = logging.getLogger(__name__) -def prompt_convert(args, prompt_config, prompt_weights): - prompt_templates = prompt_config["task_templates"] +def prompt_convert(out_file, prompt_config, prompt_weights): + nemo_type = "peft_tuning" if "peft" in prompt_config else "prompt_learning" - actual_task_id = 0 vtokens_embeddings = [] vtokens_len = [] - for task_name_id, prompt_task in enumerate(prompt_templates): - prompt_task_name = prompt_task["taskname"] - LOGGER.info(f"Task {actual_task_id}: {prompt_task['taskname']}") - prompt_task_weights = prompt_weights["prompt_table"].get( - f"prompt_table.{prompt_task_name}.prompt_embeddings.weight") - if prompt_task_weights is None: - continue + + if nemo_type == "peft_tuning": + prompt_task_weights = prompt_weights[ + "model.embedding.adapter_layer.ptuning_adapter.inference_table"] vtokens_embeddings.append(prompt_task_weights) vtokens_len.append(prompt_task_weights.shape[0]) - actual_task_id += 1 + else: + prompt_templates = prompt_config["task_templates"] + actual_task_id = 0 + for task_name_id, prompt_task in enumerate(prompt_templates): + prompt_task_name = prompt_task["taskname"] + LOGGER.info(f"Task {actual_task_id}: {prompt_task['taskname']}") + prompt_task_weights = prompt_weights["prompt_table"].get( + f"prompt_table.{prompt_task_name}.prompt_embeddings.weight") + if prompt_task_weights is None: + continue + vtokens_embeddings.append(prompt_task_weights) + vtokens_len.append(prompt_task_weights.shape[0]) + actual_task_id += 1 max_vtoken_len = max(vtokens_len) embedding_dim = vtokens_embeddings[0].shape[1] @@ -59,7 +67,7 @@ def prompt_convert(args, prompt_config, prompt_weights): vtokens_embeddings[i] = padded_table vtokens_embeddings = torch.stack(vtokens_embeddings) - np.save(args.out_file, torch_to_numpy(vtokens_embeddings)) + np.save(out_file, torch_to_numpy(vtokens_embeddings)) def main(args): @@ -84,7 +92,7 @@ def main(args): weight_path, map_location=cpu_map_location, ) - prompt_convert(args, prompt_config, prompt_weights) + prompt_convert(args.out_file, prompt_config, prompt_weights) LOGGER.info("Spent %s (h:m:s) to convert the prompt model", datetime.datetime.now() - start_time) diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index 61be4accb8..f46bff3100 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,2 +1,2 @@ -datasets~=2.3.2 +datasets~=2.14.5 rouge_score~=0.1.2 diff --git a/examples/gpt/run.py b/examples/gpt/run.py index 0b5a87f6c6..f94b7acc31 100644 --- a/examples/gpt/run.py +++ b/examples/gpt/run.py @@ -48,27 +48,29 @@ def read_config(config_path: Path): num_layers = config['builder_config']['num_layers'] paged_kv_cache = config['plugin_config']['paged_kv_cache'] tokens_per_block = config['plugin_config']['tokens_per_block'] - use_prompt_tuning = config['builder_config']['use_prompt_tuning'] + 
max_prompt_embedding_table_size = config['builder_config'][ + 'max_prompt_embedding_table_size'] dtype = config['builder_config']['precision'] gather_all_token_logits = config['builder_config'][ 'gather_all_token_logits'] use_custom_all_reduce = config['plugin_config']['use_custom_all_reduce'] quant_mode = QuantMode(config['builder_config']['quant_mode']) - model_config = ModelConfig(num_heads=num_heads, - num_kv_heads=num_kv_heads, - hidden_size=hidden_size, - vocab_size=vocab_size, - num_layers=num_layers, - gpt_attention_plugin=use_gpt_attention_plugin, - remove_input_padding=remove_input_padding, - paged_kv_cache=paged_kv_cache, - tokens_per_block=tokens_per_block, - use_prompt_tuning=use_prompt_tuning, - dtype=dtype, - quant_mode=quant_mode, - gather_all_token_logits=gather_all_token_logits, - use_custom_all_reduce=use_custom_all_reduce) + model_config = ModelConfig( + num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + remove_input_padding=remove_input_padding, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + dtype=dtype, + quant_mode=quant_mode, + gather_all_token_logits=gather_all_token_logits, + use_custom_all_reduce=use_custom_all_reduce) dtype = config['builder_config']['precision'] max_input_len = config['builder_config']['max_input_len'] @@ -290,7 +292,7 @@ def generate( max_output_len, beam_width=num_beams) - ptuning_args = [] if not model_config.use_prompt_tuning else ptuning_setup( + ptuning_args = [] if model_config.max_prompt_embedding_table_size == 0 else ptuning_setup( prompt_table, dtype, model_config.hidden_size, tasks, input_ids, input_lengths, model_config.remove_input_padding) diff --git a/examples/gpt/summarize.py b/examples/gpt/summarize.py index 4467f3ce3d..5182e7ab18 100644 --- a/examples/gpt/summarize.py +++ b/examples/gpt/summarize.py @@ -26,6 +26,7 @@ import tensorrt_llm import tensorrt_llm.profiler as profiler from tensorrt_llm.logger import logger from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.tools.ppl import ppl from build import find_engines # isort:skip @@ -48,6 +49,11 @@ def TRTGPT(args, config): num_kv_heads = 1 if multi_query_mode else num_heads paged_kv_cache = config['plugin_config']['paged_kv_cache'] tokens_per_block = config['plugin_config']['tokens_per_block'] + gather_all_token_logits = config['builder_config'].get( + 'gather_all_token_logits', False) + assert not (args.eval_ppl and not gather_all_token_logits), \ + "PPL evaluation requires engine built with gather_all_token_logits enabled" + use_custom_all_reduce = config['plugin_config']['use_custom_all_reduce'] quant_mode = QuantMode(config['builder_config'].get('quant_mode', 0)) @@ -63,6 +69,7 @@ def TRTGPT(args, config): paged_kv_cache=paged_kv_cache, dtype=dtype, quant_mode=quant_mode, + gather_all_token_logits=gather_all_token_logits, use_custom_all_reduce=use_custom_all_reduce, ) @@ -203,27 +210,71 @@ def main(args): beam_width=num_beams) if tensorrt_llm_gpt.remove_input_padding: - output_ids = tensorrt_llm_gpt.decode_batch( - line_encoded, sampling_config) - else: - output_ids = tensorrt_llm_gpt.decode( + outputs = tensorrt_llm_gpt.decode_batch( line_encoded, - input_lengths, sampling_config, - ) - + output_sequence_lengths=True, + return_dict=True) + else: + outputs = tensorrt_llm_gpt.decode(line_encoded, + input_lengths, + sampling_config, + 
output_sequence_lengths=True, + return_dict=True) torch.cuda.synchronize() # Extract a list of tensors of shape beam_width x output_ids. if tensorrt_llm_gpt.mapping.is_first_pp_rank(): + output_ids = outputs['output_ids'] output_beams_list = [ tokenizer.batch_decode(output_ids[batch_idx, :, input_lengths[batch_idx]:], skip_special_tokens=True) for batch_idx in range(batch_size) ] - return output_beams_list, output_ids[:, :, max_length:].tolist() - return [], [] + + ppls = [] + if args.eval_ppl: + seq_lens = outputs['sequence_lengths'] + context_logits = outputs['context_logits'] + if tensorrt_llm_gpt.remove_input_padding: + context_logits = context_logits.flatten(end_dim=1) + seg_points = [0] + np.cumsum(input_lengths).tolist() + context_logits = [ + context_logits[s:e] + for s, e in zip(seg_points[:-1], seg_points[1:]) + ] + else: + context_logits = [ + context_logits[bidx, :input_lengths[bidx]] + for bidx in range(batch_size) + ] + + # Remove the first generation logits which are same to last context logits + # Step dim at 1 + generation_logits = torch.stack( + outputs['generation_logits'][1:], dim=1) + for bidx in range(batch_size): + # [batch, beam, step] + curr_len = seq_lens[bidx, 0] + curr_ctx_len = input_lengths[bidx] + curr_gen_len = curr_len - curr_ctx_len + + curr_ids = output_ids[bidx, 0, 1:curr_len] + curr_logits = torch.cat([ + context_logits[bidx], + generation_logits[bidx, :curr_gen_len - 1] + ], + dim=0) + curr_ppl = ppl(curr_logits, curr_ids) + ppls.append(curr_ppl) + logger.debug( + f"TensorRT-LLM PPL: {curr_ppl:.3f} | Generation length: {curr_gen_len}" + ) + + return output_beams_list, output_ids[:, :, + max_length:].tolist(), ppls + return [], [], [] def eval_hf(datapoint, eval_type='summarize'): batch_size = len(datapoint) @@ -264,32 +315,63 @@ def main(args): line_encoded = torch.cat(line_encoded, axis=0).cuda() with torch.no_grad(): - output = model.generate(line_encoded, - max_length=len(line_encoded[0]) + - output_len, - top_k=top_k, - temperature=temperature, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id, - num_beams=num_beams, - num_return_sequences=num_beams, - early_stopping=True, - length_penalty=length_penalty) + outputs = model.generate(line_encoded, + max_length=len(line_encoded[0]) + + output_len, + top_k=top_k, + temperature=temperature, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, + num_beams=num_beams, + num_return_sequences=num_beams, + early_stopping=True, + length_penalty=length_penalty, + output_scores=True, + return_dict_in_generate=True) + # model.generate cannot return context logits? 
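The `ppl` helper imported from `tensorrt_llm.tools.ppl` above is applied to a `[num_tokens, vocab]` logits tensor paired with the token ids those logits are expected to predict. As an illustrative aside, a minimal, self-contained sketch of such a per-token perplexity computation is shown below; it demonstrates the idea only and is not necessarily the library helper's exact implementation.

```python
# Hedged sketch of per-token perplexity over next-token logits.
# Assumes logits[t] scores the distribution for token_ids[t], i.e. targets are
# already shifted by one, as in the calling code that pairs curr_logits with
# output_ids[..., 1:curr_len].
import torch
import torch.nn.functional as F


def perplexity(logits: torch.Tensor, token_ids: torch.Tensor) -> float:
    nll = F.cross_entropy(logits.float(), token_ids.long(), reduction="mean")
    return torch.exp(nll).item()
```

Under that reading, concatenating the context logits with all but the first generation logits (which duplicate the last context step) yields exactly one logit row per predicted token.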
+ context_outputs = model(line_encoded) - tokens_list = output[:, len(line_encoded[0]):].tolist() - output = output.reshape([batch_size, num_beams, -1]) + output_ids = outputs['sequences'] + tokens_list = output_ids[:, len(line_encoded[0]):].tolist() + output_ids = output_ids.reshape([batch_size, num_beams, -1]) output_lines_list = [ - tokenizer.batch_decode(output[:, i, len(line_encoded[0]):], + tokenizer.batch_decode(output_ids[:, i, len(line_encoded[0]):], skip_special_tokens=True) for i in range(num_beams) ] - return output_lines_list, tokens_list + ppls = [] + if args.eval_ppl and batch_size == 1: + # Only for batch size of 1 + seq_lens = [output_ids.size(-1) for _ in range(batch_size)] + context_logits = context_outputs['logits'] + # Remove the first generation logits which are same to last context logits + generation_logits = torch.stack(outputs['scores'][1:], dim=1) + + ppls = [] + for bidx in range(batch_size): + curr_len = seq_lens[bidx] + curr_ctx_len = input_lengths[bidx] + curr_gen_len = curr_len - curr_ctx_len + + curr_ids = output_ids[bidx, 0, 1:curr_len] + curr_logits = torch.cat([ + context_logits[bidx], + generation_logits[bidx, :curr_gen_len - 1] + ], + dim=0) + curr_ppl = ppl(curr_logits, curr_ids) + ppls.append(curr_ppl) + logger.debug( + f"HF PPL: {curr_ppl:.3f} | Generation length: {curr_gen_len}" + ) + + return output_lines_list, tokens_list, ppls if test_trt_llm: datapoint = dataset['test'][0:1] - output, _ = eval_tensorrt_llm(datapoint[dataset_input_key], - eval_type=args.eval_type) + output, *_ = eval_tensorrt_llm(datapoint[dataset_input_key], + eval_type=args.eval_type) if runtime_rank == 0: logger.info( "---------------------------------------------------------") @@ -302,8 +384,8 @@ def main(args): if test_hf: datapoint = dataset['test'][0:1] - output, _ = eval_hf(datapoint[dataset_input_key], - eval_type=args.eval_type) + output, *_ = eval_hf(datapoint[dataset_input_key], + eval_type=args.eval_type) logger.info("---------------------------------------------------------") logger.info("HF Generated : ") logger.info(f" Input : {datapoint[dataset_input_key]}") @@ -316,6 +398,7 @@ def main(args): for i in range(num_beams): metric_tensorrt_llm[i].seed = 0 metric_hf[i].seed = 0 + ppls_trt_llm, ppls_hf = [], [] ite_count = 0 data_point_idx = 0 @@ -330,13 +413,13 @@ def main(args): if test_trt_llm: profiler.start('tensorrt_llm') - output_tensorrt_llm, _ = eval_tensorrt_llm( + output_tensorrt_llm, _, curr_ppls_trt_llm = eval_tensorrt_llm( datapoint[dataset_input_key]) profiler.stop('tensorrt_llm') if test_hf: profiler.start('hf') - output_hf, _ = eval_hf(datapoint[dataset_input_key]) + output_hf, _, curr_ppls_hf = eval_hf(datapoint[dataset_input_key]) profiler.stop('hf') if runtime_rank == 0: @@ -350,6 +433,7 @@ def main(args): references=[ datapoint[dataset_output_key][batch_idx] ]) + ppls_trt_llm.extend(curr_ppls_trt_llm) if test_hf: for beam_idx in range(num_beams): for batch_idx in range(len(output_hf[beam_idx])): @@ -358,6 +442,7 @@ def main(args): references=[ datapoint[dataset_output_key][batch_idx] ]) + ppls_hf.extend(curr_ppls_hf) logger.debug('-' * 100) logger.debug(f"Input : {datapoint[dataset_input_key]}") @@ -388,6 +473,8 @@ def main(args): if args.check_accuracy and beam_idx == 0: assert computed_metrics_tensorrt_llm['rouge1'].mid[ 2] * 100 > args.tensorrt_llm_rouge1_threshold + if args.eval_ppl: + logger.info(f" Per-token perplexity: {np.mean(ppls_trt_llm)}") if test_hf: np.random.seed(0) # rouge score use sampling to compute the score logger.info( @@ 
-399,6 +486,8 @@ def main(args): for key in computed_metrics_hf.keys(): logger.info( f' {key} : {computed_metrics_hf[key].mid[2]*100}') + if args.eval_ppl and args.batch_size == 1: + logger.info(f" Per-token perplexity: {np.mean(ppls_hf)}") if __name__ == '__main__': @@ -433,6 +522,7 @@ if __name__ == '__main__': default='summarize', choices=['summarize', 'code_completion']) parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--eval_ppl', action='store_true') args = parser.parse_args() if args.tokenizer == None: diff --git a/examples/gptj/README.md b/examples/gptj/README.md index ad6fdcf1c9..a66bbcd8b3 100644 --- a/examples/gptj/README.md +++ b/examples/gptj/README.md @@ -14,7 +14,9 @@ code is located in [`examples/gptj`](./). There are three main files in that fol ## Support Matrix * FP16 * FP8 - * INT4 Weight-Only + * INT8 & INT4 per-channel weight-only + * Groupwise quantization (AWQ) + * INT8 KV CACHE (+ AWQ/per-channel weight-only) * FP8 KV CACHE ## Usage @@ -130,6 +132,67 @@ If you find that the default fp16 accumulation (`--enable_context_fmha`) cannot Note `--enable_context_fmha` / `--enable_context_fmha_fp32_acc` has to be used together with `--use_gpt_attention_plugin float16`. +#### INT8 KV cache +INT8 KV cache could be enabled to reduce memory footprint. It will bring more performance gains when batch size gets larger. + +You can get the INT8 scale of KV cache through `hf_gptj_convert.py`: +```bash +# Enable INT8 calibration, and save scales +python hf_gptj_convert.py -i gptj_model -o gptj_int8_model --calibrate-kv-cache -t float16 +``` +Now the FT-format checkpoint with INT8 KV cache scales is saved to `gptj_int8_model/1-gpu`. +You can pass this `gptj_int8_model/1-gpu` directory to `build.py` through the argument called `--ft_model_dir`. + +INT8 KV cache could be combined with either per-channel INT8/INT4 weight-only quantization or per-group INT4 quantization (which is AWQ, actually). + +**INT8 KV cache + per-channel weight-only quantization** + +For example, you can enable INT8 KV cache together with per-channel INT8/INT4 weight-only quantization like the following command. + +**NOTE**: The whole checkpoint together with INT8 KV scales are passed to `--ft_model_dir`. +```bash +# Enable INT8 KV cache together with per-channel INT8 weight-only quantization +python3 build.py --dtype=float16 \ + --log_level=verbose \ + --enable_context_fmha \ + --use_gpt_attention_plugin float16 \ + --use_gemm_plugin float16 \ + --max_batch_size=32 \ + --max_input_len=1919 \ + --max_output_len=128 \ + --remove_input_padding \ + --output_dir=gptj_engine_wo_int8_kv_cache \ + --use_weight_only \ + --weight_only_precision=int8 \ + --int8_kv_cache \ + --ft_model_dir=gptj_ft_model/1-gpu/ +``` + +**INT8 KV cache + AWQ** + +In addition, you can enable INT8 KV cache together with AWQ (per-group INT4 weight-only quantization)like the following command. + +**NOTE**: AWQ checkpoint is passed through `--model_dir`, and the INT8 scales of KV cache is through `--ft_model_dir`. 
+```bash +# Enable INT8 KV cache together with AWQ +python3 build.py --dtype=float16 \ + --log_level=verbose \ + --enable_context_fmha \ + --use_gpt_attention_plugin float16 \ + --use_gemm_plugin float16 \ + --max_batch_size=32 \ + --max_input_len=1919 \ + --max_output_len=128 \ + --remove_input_padding \ + --output_dir=gptj_engine_awq_int8_kv_cache/ \ + --use_weight_only \ + --per_group \ + --weight_only_precision=int4 \ + --model_dir=awq_int4_weight_only_quantized_models \ + --int8_kv_cache \ + --ft_model_dir=gptj_ft_model/1-gpu/ +``` + #### FP8 KV cache One can enable FP8 for KV cache to reduce memory footprint used by KV cache and improve the accuracy over INT8 KV cache. There are 3 options need to be added to the invocation of `build.py` for that: diff --git a/examples/gptj/build.py b/examples/gptj/build.py index 80a32f157d..9a4c0702bc 100644 --- a/examples/gptj/build.py +++ b/examples/gptj/build.py @@ -16,19 +16,20 @@ import argparse import json import os import time +from pathlib import Path import tensorrt as trt import torch import torch.multiprocessing as mp from transformers import AutoModelForCausalLM -from weight import get_scaling_factors, load_from_awq_gpt_j, load_from_hf_gpt_j +from weight import (get_scaling_factors, load_from_awq_gpt_j, + load_from_bin_gpt_j, load_from_hf_gpt_j, parse_config) import tensorrt_llm from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (weight_only_groupwise_quantize, - weight_only_quantize) +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -63,6 +64,13 @@ def parse_arguments(args): type=str, default=None, help='The path to HF GPT-J model / checkpoints to read weights from') + parser.add_argument( + '--ft_model_dir', + type=str, + default=None, + help= + 'The path to FT-format (binary) GPT-J model / checkpoints to read weights from' + ) parser.add_argument('--dtype', type=str, default='float16', @@ -102,12 +110,6 @@ def parse_arguments(args): type=str, default=False, choices=['float16', 'float32']) - parser.add_argument('--use_weight_only_quant_matmul_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16']) parser.add_argument('--use_layernorm_plugin', nargs='?', const='float16', @@ -145,6 +147,13 @@ def parse_arguments(args): help= 'By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV' ) + parser.add_argument( + '--int8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' + ) parser.add_argument( '--use_inflight_batching', action="store_true", @@ -236,21 +245,41 @@ def parse_arguments(args): args.n_layer = hf_gpt.config.n_layer args.n_positions = hf_gpt.config.n_positions args.vocab_size = hf_gpt.config.vocab_size - - assert not (args.use_weight_only and args.weight_only_precision - == 'int8'), "Not support int8 weight only." - - assert not (args.use_weight_only and args.weight_only_precision == 'int4' - and args.per_group - == False), "We only support AWQ for int4 weight only." 
+ elif args.ft_model_dir is not None: + logger.info(f"Setting model configuration from {args.ft_model_dir}.") + n_embd, n_head, n_layer, n_positions, vocab_size, _, hidden_act, rotary_pct, bias, inter_size, multi_query_mode, dtype, prompt_num_tasks, prompt_max_vocab_size = parse_config( + Path(args.ft_model_dir) / "config.ini") + args.n_embd = n_embd + args.n_head = n_head + args.n_layer = n_layer + args.n_positions = n_positions + args.vocab_size = vocab_size + args.hidden_act = hidden_act + args.rotary_pct = rotary_pct + args.bias = bias + args.dtype = dtype + args.inter_size = inter_size + args.multi_query_mode = multi_query_mode if args.use_weight_only: - args.quant_mode = QuantMode.use_weight_only( - args.weight_only_precision == 'int4') + if args.per_group: + assert args.weight_only_precision == 'int4', "We only support per-group quantization (AWQ/GPT-Q) with INT4 precision" + args.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=True, + use_int4_weights=True) + else: + args.quant_mode = QuantMode.use_weight_only( + args.weight_only_precision == 'int4') else: args.quant_mode = QuantMode(0) - if args.fp8_kv_cache: + if args.int8_kv_cache: + args.quant_mode = args.quant_mode.set_int8_kv_cache() + elif args.fp8_kv_cache: assert ( args.use_gpt_attention_plugin ), "You have to use GPT attention plugin when fp8 KV cache is set" @@ -289,6 +318,9 @@ def build_rank_engine(builder: Builder, @return: The built engine. ''' kv_dtype = trt.float16 if args.dtype == 'float16' else trt.float32 + mapping = Mapping(world_size=args.world_size, + rank=rank, + tp_size=args.world_size) # TP only # Initialize Module tensorrt_llm_gpt = tensorrt_llm.models.GPTJForCausalLM( @@ -301,28 +333,21 @@ def build_rank_engine(builder: Builder, rotary_dim=args.rotary_dim, dtype=kv_dtype, logits_dtype=args.logits_dtype, - mapping=Mapping(world_size=args.world_size, - rank=rank, - tp_size=args.world_size), # TP only + mapping=mapping, quant_mode=args.quant_mode) - if args.use_weight_only_quant_matmul_plugin: - tensorrt_llm_gpt = weight_only_quantize(tensorrt_llm_gpt) - if args.use_weight_only and args.weight_only_precision == 'int4': - if args.per_group: - tensorrt_llm_gpt = weight_only_groupwise_quantize( - model=tensorrt_llm_gpt, - quant_mode=QuantMode.from_description( - quantize_weights=True, - quantize_activations=False, - per_token=False, - per_channel=False, - per_group=True, - use_int4_weights=True), - group_size=128, - zero=False, - pre_quant_scale=True, - exclude_modules=[], - ) + + quantize_kwargs = {} + if args.use_weight_only and args.per_group: + assert args.weight_only_precision == 'int4' + quantize_kwargs = { + "group_size": 128, + "zero": False, + "pre_quant_scale": True, + "exclude_modules": [], + } + tensorrt_llm_gpt = quantize_model(tensorrt_llm_gpt, args.quant_mode, + **quantize_kwargs) + if args.model_dir is not None: assert hf_gpt is not None, f'Could not load weights from hf_gpt model as it is not loaded yet.' 
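As a side note on the flag handling above: the snippet below is a minimal sketch of how the resulting quantization mode can be constructed and inspected, combining per-group INT4 weight-only (AWQ-style) with an INT8 KV cache. It uses only `QuantMode` helpers that already appear in this change and is illustrative rather than the build script's exact flow.

```python
# Illustrative only: per-group INT4 weight-only plus INT8 KV cache,
# mirroring the flag handling in build.py above.
from tensorrt_llm.quantization import QuantMode

quant_mode = QuantMode.from_description(quantize_weights=True,
                                        quantize_activations=False,
                                        per_token=False,
                                        per_channel=False,
                                        per_group=True,
                                        use_int4_weights=True)
quant_mode = quant_mode.set_int8_kv_cache()

assert quant_mode.is_weight_only() and quant_mode.is_int4_weight_only()
assert quant_mode.has_int8_kv_cache()
# No activation quantization was requested, so this is not SmoothQuant.
assert not quant_mode.has_act_and_weight_quant()
```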
if args.enable_fp8: @@ -333,13 +358,18 @@ def build_rank_engine(builder: Builder, if args.use_weight_only and args.weight_only_precision == 'int4' and args.per_group: load_from_awq_gpt_j(tensorrt_llm_gpt, awq_gpt_j=hf_gpt, + ft_model_dir=args.ft_model_dir, config=awq_gptj_config, + mapping=mapping, fp16=(args.dtype == 'float16')) else: load_from_hf_gpt_j(tensorrt_llm_gpt, hf_gpt, fp16=(args.dtype == 'float16'), scaling_factors=gptj_scaling_factors) + elif args.ft_model_dir is not None: + load_from_bin_gpt_j(tensorrt_llm_gpt, args.ft_model_dir, rank, + args.world_size, args.dtype) # Module -> Network network = builder.create_network() @@ -362,13 +392,13 @@ def build_rank_engine(builder: Builder, if args.enable_context_fmha_fp32_acc: network.plugin_config.set_context_fmha( ContextFMHAType.enabled_with_fp32_acc) - if args.use_weight_only_quant_matmul_plugin: - network.plugin_config.set_weight_only_quant_matmul_plugin( - dtype=args.use_weight_only_quant_matmul_plugin) if args.use_weight_only: if args.per_group: network.plugin_config.set_weight_only_groupwise_quant_matmul_plugin( dtype='float16') + else: + network.plugin_config.set_weight_only_quant_matmul_plugin( + dtype='float16') if args.world_size > 1: network.plugin_config.set_nccl_plugin(args.dtype) if args.remove_input_padding: @@ -400,6 +430,8 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_gpt) return engine @@ -417,6 +449,9 @@ def build(rank, args): # skip other ranks if parallel_build is enabled if args.parallel_build and cur_rank != rank: continue + # NOTE(nkorobov): when only int8 kv cache is used together with paged kv cache no int8 tensors are exposed to TRT + int8_trt_flag = args.quant_mode.has_act_and_weight_quant() or ( + not args.paged_kv_cache and args.quant_mode.has_int8_kv_cache()) builder_config = builder.create_builder_config( name=MODEL_NAME, @@ -435,7 +470,7 @@ def build(rank, args): max_output_len=args.max_output_len, max_num_tokens=args.max_num_tokens, fp8=args.enable_fp8, - int8=args.quant_mode.has_act_or_weight_quant(), + int8=int8_trt_flag, quant_mode=args.quant_mode, strongly_typed=args.strongly_typed) diff --git a/examples/gptj/hf_gptj_convert.py b/examples/gptj/hf_gptj_convert.py new file mode 100644 index 0000000000..073085eaf9 --- /dev/null +++ b/examples/gptj/hf_gptj_convert.py @@ -0,0 +1,349 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Convert huggingface GPT-J model. Use https://huggingface.co/EleutherAI/gpt-j-6b as demo. 
+''' +import argparse +import configparser +import dataclasses +import functools +import os +import platform +from collections import defaultdict +from pathlib import Path + +import torch +import torch.multiprocessing as multiprocessing +import torch.nn as nn +from tqdm import tqdm +from transformers import AutoModelForCausalLM # transformers-4.10.0-py3 +from transformers import AutoTokenizer +from transformers.pytorch_utils import Conv1D +from utils.convert import split_and_save_weight + +from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy + + +@torch.no_grad() +def capture_activation_range(model, + tokenizer, + dataset, + num_samples=512, + seq_len=512): + model.eval() + device = next(model.parameters()).device + act_scales = defaultdict(lambda: {"x": None, "y": None, "w": None}) + + def stat_tensor(name, tensor, act_scales, key): + hidden_dim = tensor.shape[-1] + tensor = tensor.view(-1, hidden_dim).abs().detach() + comming_max = torch.max(tensor, dim=0)[0].float() + + if act_scales[name][key] is None: + act_scales[name][key] = comming_max + else: + act_scales[name][key] = torch.max(act_scales[name][key], + comming_max) + + def stat_input_hook(m, x, y, name): + if isinstance(x, tuple): + x = x[0] + stat_tensor(name, x, act_scales, "x") + stat_tensor(name, y, act_scales, "y") + + if act_scales[name]["w"] is None: + act_scales[name]["w"] = m.weight.abs().clip(1e-8, + None).max(dim=0)[0] + + hooks = [] + for name, m in model.named_modules(): + if isinstance(m, nn.Linear) or isinstance(m, Conv1D): + hooks.append( + m.register_forward_hook( + functools.partial(stat_input_hook, name=name))) + + for i in tqdm(range(num_samples), desc="calibrating model"): + input_ids = tokenizer(dataset[i]["text"], + return_tensors="pt", + max_length=seq_len, + truncation=True).input_ids.to(device) + model(input_ids) + + for h in hooks: + h.remove() + + return act_scales + + +@dataclasses.dataclass(frozen=True) +class ProgArgs: + out_dir: str + in_file: str + tensor_parallelism: int = 1 + processes: int = 4 + calibrate_kv_cache: bool = False + model: str = "gpt" + storage_type: str = "fp32" + dataset_cache_dir: str = None + load_model_on_cpu: bool = False + convert_model_on_cpu: bool = False + + @staticmethod + def parse(args=None) -> 'ProgArgs': + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--out-dir', + '-o', + type=str, + help='file name of output directory', + required=True) + parser.add_argument('--in-file', + '-i', + type=str, + help='file name of input checkpoint file', + required=True) + parser.add_argument('--tensor-parallelism', + '-tp', + type=int, + help='Requested tensor parallelism for inference', + default=1) + parser.add_argument( + "--processes", + "-p", + type=int, + help= + "How many processes to spawn for conversion (default: 4). Set it to a lower value to reduce RAM usage.", + default=4) + parser.add_argument( + "--calibrate-kv-cache", + "-kv", + action="store_true", + help= + "Generate scaling factors for KV cache. Used for storing KV cache in int8." 
+ ) + parser.add_argument( + "--model", + default="gpt2", + type=str, + help="Specify GPT variants to convert checkpoints correctly", + choices=["gpt2", "santacoder", "starcoder"]) + parser.add_argument("--storage-type", + "-t", + type=str, + default="float32", + choices=["float32", "float16", "bfloat16"]) + parser.add_argument("--dataset-cache-dir", + type=str, + default=None, + help="cache dir to load the hugging face dataset") + parser.add_argument("--load-model-on-cpu", action="store_true") + parser.add_argument("--convert-model-on-cpu", action="store_true") + return ProgArgs(**vars(parser.parse_args(args))) + + +def merge_qkv_scales(q_name, hf_model, scales, gptj_qkv_para): + layer_name_q = q_name.replace(".weight", "") + layer_name_k = layer_name_q.replace("q_proj", "k_proj") + layer_name_v = layer_name_q.replace("q_proj", "v_proj") + layer_name_qkv = layer_name_q.replace("q_proj", "qkv_proj") + + q = hf_model.state_dict()[layer_name_q + ".weight"] + k = hf_model.state_dict()[layer_name_k + ".weight"] + v = hf_model.state_dict()[layer_name_v + ".weight"] + + weight = torch.cat([q, k, v], dim=0) + + scales[layer_name_qkv]["x"] = scales[layer_name_q]["x"] + scales[layer_name_qkv]["w"] = weight.abs().max(dim=1)[0] + + scales[layer_name_qkv]["y"] = torch.cat([ + scales[layer_name_q]["y"], scales[layer_name_k]["y"], + scales[layer_name_v]["y"] + ], + dim=0) + + gptj_qkv_para[layer_name_qkv] = weight.transpose(0, 1) + + +def gptj_to_trt_llm_name(orig_name): + global_weights = { + "transformer.wte.weight": "model.wte", + "transformer.ln_f.bias": "model.final_layernorm.bias", + "transformer.ln_f.weight": "model.final_layernorm.weight", + "lm_head.weight": "model.lm_head.weight", + "lm_head.bias": "model.lm_head.bias" + } + + if orig_name in global_weights: + return global_weights[orig_name] + + _, _, layer_id, *weight_name = orig_name.split(".") + layer_id = int(layer_id) + weight_name = "transformer." + ".".join(weight_name) + + per_layer_weights = { + "transformer.ln_1.bias": "input_layernorm.bias", + "transformer.ln_1.weight": "input_layernorm.weight", + "transformer.attn.q_proj.weight": "attention.query.weight", + "transformer.attn.q_proj.bias": "attention.query.bias", + "transformer.attn.k_proj.weight": "attention.key.weight", + "transformer.attn.k_proj.bias": "attention.key.bias", + "transformer.attn.v_proj.weight": "attention.value.weight", + "transformer.attn.v_proj.bias": "attention.value.bias", + "transformer.attn.out_proj.bias": "attention.dense.bias", + "transformer.attn.out_proj.weight": "attention.dense.weight", + "transformer.mlp.fc_in.bias": "mlp.dense_h_to_4h.bias", + "transformer.mlp.fc_in.weight": "mlp.dense_h_to_4h.weight", + "transformer.mlp.fc_out.bias": "mlp.dense_4h_to_h.bias", + "transformer.mlp.fc_out.weight": "mlp.dense_4h_to_h.weight", + } + return f"layers.{layer_id}.{per_layer_weights[weight_name]}" + + +# GPT-J uses nn.Linear for these following ops whose weight matrix is transposed compared to gpt2. +# In order to use the preprocess codes of gpt2, we transpose them firstly. 
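To make the comment above concrete: `torch.nn.Linear` stores its weight as `[out_features, in_features]`, whereas the GPT-2 style `Conv1D` layers the shared preprocessing code expects store `[in_features, out_features]`. The sketch below uses arbitrary toy sizes purely to show that layout difference; `transpose_weights()` below flips the `nn.Linear` layout to match.

```python
# Toy shapes, assumed for illustration of the layout difference only.
import torch.nn as nn
from transformers.pytorch_utils import Conv1D

linear = nn.Linear(in_features=8, out_features=32, bias=False)
conv1d = Conv1D(nf=32, nx=8)  # GPT-2 style projection: weight is [nx, nf]

print(tuple(linear.weight.shape))  # (32, 8)  -> needs transposing
print(tuple(conv1d.weight.shape))  # (8, 32)
```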
+def transpose_weights(hf_name, param): + weight_to_transpose = ["out_proj", "fc_in", "fc_out"] + if any([k in hf_name for k in weight_to_transpose]): + if len(param.shape) == 2: + param = param.transpose(0, 1) + return param + + +@torch.no_grad() +def hf_gptj_converter(args: ProgArgs): + infer_tp = args.tensor_parallelism + multi_query_mode = False + saved_dir = Path(args.out_dir) / f"{infer_tp}-gpu" + saved_dir.mkdir(parents=True, exist_ok=True) + + # load position_embedding from rank 0 + model = AutoModelForCausalLM.from_pretrained(args.in_file, + device_map="auto", + trust_remote_code=True) + if args.load_model_on_cpu: + model = model.cpu() + torch.cuda.empty_cache() + act_range = {} + gptj_qkv_para = {} + + if args.calibrate_kv_cache: + os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get( + "TOKENIZERS_PARALLELISM", "false") + from datasets import load_dataset + dataset = load_dataset("lambada", + split="validation", + cache_dir=args.dataset_cache_dir) + act_range = capture_activation_range( + model, AutoTokenizer.from_pretrained(args.in_file), dataset) + + config = configparser.ConfigParser() + config["gpt"] = {} + for key in vars(args): + config["gpt"][key] = f"{vars(args)[key]}" + for k, v in vars(model.config).items(): + config["gpt"][k] = f"{v}" + config["gpt"]["storage_dtype"] = args.storage_type + config["gpt"]["multi_query_mode"] = str(multi_query_mode) + with open(saved_dir / "config.ini", 'w') as configfile: + config.write(configfile) + + storage_type = str_dtype_to_torch(args.storage_type) + + global_ft_weights = [ + "model.wte", "model.final_layernorm.bias", + "model.final_layernorm.weight", "model.lm_head.weight", + "model.lm_head.bias" + ] + + int8_outputs = None + if args.calibrate_kv_cache: + int8_outputs = "kv_cache_only" + + starmap_args = [] + for name, param in model.named_parameters(): + if "weight" not in name and "bias" not in name: + continue + trt_llm_name = gptj_to_trt_llm_name(name) + + param = transpose_weights(name, param) + + if args.convert_model_on_cpu: + param = param.cpu() + if trt_llm_name in global_ft_weights: + torch_to_numpy(param.to(storage_type).cpu()).tofile( + saved_dir / f"{trt_llm_name}.bin") + elif 'q_proj' in name: + trt_llm_name = trt_llm_name.replace("query", "query_key_value") + # Needed by QKV projection weight split. 
With multi_query_mode one does not simply take + # out_dim and divide it by 3 to get local_dim because out_dim = local_dim + 2 * head_size + local_dim = model.transformer.h[ + 0].attn.embed_dim if multi_query_mode else None + merge_qkv_scales(name, model, act_range, gptj_qkv_para) + qkv = (0, saved_dir, infer_tp, trt_llm_name, + gptj_qkv_para.get( + name.replace(".weight", + "").replace(".q_proj", + ".qkv_proj")).to(storage_type), + storage_type, + act_range.get( + name.replace(".weight", + "").replace(".q_proj", ".qkv_proj")), { + "int8_outputs": int8_outputs, + "multi_query_mode": multi_query_mode, + "local_dim": local_dim + }) + starmap_args.append(qkv) + elif 'k_proj' in name or 'v_proj' in name: + continue + else: + starmap_args.append( + (0, saved_dir, infer_tp, trt_llm_name, param.to(storage_type), + storage_type, act_range.get(name.replace(".weight", "")), { + "int8_outputs": int8_outputs, + "multi_query_mode": multi_query_mode, + "local_dim": None + })) + + starmap_args = tqdm(starmap_args, desc="saving weights") + if args.processes > 1: + with multiprocessing.Pool(args.processes) as pool: + pool.starmap(split_and_save_weight, starmap_args) + else: + # simpler for debug situations + for starmap_arg in starmap_args: + split_and_save_weight(*starmap_arg) + + +def run_conversion(args: ProgArgs): + if args.processes > 1 and platform.system() == "Windows": + print( + "Resetting processes to 1 because multi-process on Windows is not implemented." + ) + args = dataclasses.replace(args, processes=1) + + print("\n=============== Arguments ===============") + for key, value in vars(args).items(): + print(f"{key}: {value}") + print("========================================") + + assert (args.calibrate_kv_cache), \ + "INT8 kv cache must be enabled for this script. Otherwise you can directly build engines from HuggingFace checkpoints, no need to do this FT-format conversion. " + hf_gptj_converter(args) + + +if __name__ == "__main__": + torch.multiprocessing.set_start_method("spawn") + run_conversion(ProgArgs.parse()) diff --git a/examples/gptj/utils/convert.py b/examples/gptj/utils/convert.py new file mode 100644 index 0000000000..f2f204b985 --- /dev/null +++ b/examples/gptj/utils/convert.py @@ -0,0 +1,273 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Utilities for exporting a model to our custom format. 
+""" + +import numpy as np +import torch + +from tensorrt_llm._utils import torch_to_numpy + + +def cpu_map_location(storage, loc): + return storage.cpu() + + +def gpu_map_location(storage, loc): + if loc.startswith("cuda"): + training_gpu_idx = int(loc.split(":")[1]) + inference_gpu_idx = training_gpu_idx % torch.cuda.device_count() + return storage.cuda(inference_gpu_idx) + elif loc.startswith("cpu"): + return storage.cpu() + else: + raise ValueError(f"Not handled {loc}") + + +def save_val(val, dir, key, tp_num=None): + suffix = "bin" if tp_num is None else f"{tp_num}.bin" + val.tofile(dir / f"model.{key}.{suffix}") + + +def save_split(split_vals, dir, key, i, split_factor): + for j, val in enumerate(split_vals): + save_val(val, dir, key, i * split_factor + j) + + +def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False): + """ + This function has two purposes: + - compute quantized weights, scaled either per-tensor or per-column + - compute scaling factors + + Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ. + CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W. + CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor. + + Here is the list of what we need (T means per-tensor, C per-column): + - scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8). Used before the GEMM. (T) + - scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T) + - scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C) + - scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32) + to quant range (int8) (used for CUBLAS) (T, C) + + Note that we don't do anything special about row-parallel GEMM. Theoretically, we could have per-GPU scaling factors too, + but then the model would change depending on the number of GPUs used. + + For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection, we consider it + as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V. + """ + + # compute weight scaling factors for fp->int8 and int8->fp + if is_qkv and not multi_query_mode: + scale_w_orig_quant_t = 127. / act_range["w"].reshape(3, -1).max( + dim=-1, keepdims=True)[0].cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].reshape(3, + -1).cpu().numpy() + elif is_qkv and multi_query_mode: + raise ValueError( + f"Multi-query w/ int8 quant has not been supported yet") + else: + scale_w_orig_quant_t = 127. / act_range["w"].max().cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy() + scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t + scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c + + # compute the rest of needed scaling factors + scale_x_orig_quant_t = np.array(127. / act_range["x"].max().item()) + scale_y_orig_quant_t = np.array(127. / act_range["y"].max().item()) + scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.) 
+ scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_t) + scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_c) + if is_qkv: + scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t, + scale_w_orig_quant_c.shape) + scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t, + scale_w_orig_quant_c.shape) + + to_i8 = lambda x: x.round().clip(-127, 127).astype(np.int8) + return { + "weight.int8": to_i8(weights * scale_w_orig_quant_t), + "weight.int8.col": to_i8(weights * scale_w_orig_quant_c), + "scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32), + "scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32), + "scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32), + "scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32), + "scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32), + "scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32), + } + + +def write_int8(vals, + dir, + base_key, + split_dim, + tp_rank, + split_factor, + kv_cache_only=False): + if not kv_cache_only: + save_split(np.split(vals["weight.int8"], split_factor, axis=split_dim), + dir, f"{base_key}.weight.int8", tp_rank, split_factor) + save_split( + np.split(vals["weight.int8.col"], split_factor, axis=split_dim), + dir, f"{base_key}.weight.int8.col", tp_rank, split_factor) + + saved_keys_once = ["scale_y_quant_orig"] + if not kv_cache_only: + saved_keys_once += [ + "scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant" + ] + # per-column scaling factors are loaded per-gpu for ColumnParallel GEMMs (QKV, FC1) + if not kv_cache_only: + if split_dim == -1: + save_split( + np.split(vals["scale_w_quant_orig.col"], + split_factor, + axis=split_dim), dir, + f"{base_key}.scale_w_quant_orig.col", tp_rank, split_factor) + save_split( + np.split(vals["scale_y_accum_quant.col"], + split_factor, + axis=split_dim), dir, + f"{base_key}.scale_y_accum_quant.col", tp_rank, split_factor) + else: + saved_keys_once += [ + "scale_w_quant_orig.col", "scale_y_accum_quant.col" + ] + + if tp_rank == 0: + for save_key in saved_keys_once: + save_val(vals[save_key], dir, f"{base_key}.{save_key}") + + +# Note: in multi_query_mode, only query heads are split between multiple GPUs, while key/value head +# are not split as there is only one head per key/value. 
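To ground the scaling-factor docstring above, here is a tiny NumPy walk-through of the per-tensor and per-column factors for a toy 2x2 weight matrix; the calibration ranges `amax_x` and `amax_y` are assumed values, and the code mirrors the arithmetic in `generate_int8` as a sketch rather than a drop-in replacement.

```python
# Toy int8 scaling-factor example; calibration ranges are assumed values.
import numpy as np

weights = np.array([[0.2, -1.5],
                    [0.8, 0.4]], dtype=np.float32)   # [in, out] toy weight
amax_w_col = np.abs(weights).max(axis=0)             # per-column |w| max -> [0.8, 1.5]
amax_x, amax_y = 3.0, 6.0                            # assumed activation ranges

scale_w_orig_quant_col = 127.0 / amax_w_col          # fp weight -> int8, per column (C)
scale_w_orig_quant_t = 127.0 / amax_w_col.max()      # fp weight -> int8, per tensor (T)
scale_x_orig_quant_t = 127.0 / amax_x                # fp activation -> int8 (T)
scale_y_quant_orig_t = amax_y / 127.0                # int8 GEMM output -> fp (T)
# int32 accumulator -> int8 output, per column, as in generate_int8
scale_y_accum_quant_col = (127.0 / amax_y) / (scale_x_orig_quant_t * scale_w_orig_quant_col)

weight_int8_col = np.clip(np.round(weights * scale_w_orig_quant_col), -127, 127).astype(np.int8)
print(weight_int8_col)  # [[  32 -127]
                        #  [ 127   34]]
```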
+@torch.no_grad() +def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, + storage_type, act_range, config): + use_attention_nemo_shape = config.get("use_attention_nemo_shape", False) + split_gated_activation = config.get("split_gated_activation", False) + num_attention_heads = config.get("num_attention_heads", 0) + tp_size = config.get("tp_size", 1) + int8_outputs = config.get("int8_outputs", None) + multi_query_mode = config.get("multi_query_mode", False) + local_dim = config.get("local_dim", None) + + save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only" + + if not isinstance(vals, list): + vals = [vals] + + if config.get("transpose_weights", False) and vals[0].ndim == 2: + vals = [val.T for val in vals] + if "layernorm.weight" in key and config.get("apply_layernorm_1p", False): + vals = [val + 1.0 for val in vals] + vals = [torch_to_numpy(val.cpu().to(storage_type)) for val in vals] + + if "input_layernorm.weight" in key or "input_layernorm.bias" in key or \ + "final_layernorm.weight" in key or "final_layernorm.bias" in key or \ + "mlp.dense_4h_to_h.bias" in key: + + # shared weights, only need to convert the weights of rank 0 + if tp_rank == 0: + save_val(vals[0], saved_dir, key) + + elif "attention.dense.weight" in key or "mlp.dense_4h_to_h.weight" in key: + cat_dim = 0 + val = np.concatenate(vals, axis=cat_dim) + split_vals = np.split(val, split_factor, axis=cat_dim) + save_split(split_vals, saved_dir, key, tp_rank, split_factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, + act_range, + multi_query_mode=multi_query_mode) + write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, + split_factor) + + elif "mlp.dense_h_to_4h.weight" in key or "mlp.dense_h_to_4h.bias" in key: + if split_gated_activation: + splits = [np.split(val, 2, axis=-1) for val in vals] + vals, gates = list(zip(*splits)) + cat_dim = -1 + val = np.concatenate(vals, axis=cat_dim) + split_vals = np.split(val, split_factor, axis=cat_dim) + save_split(split_vals, saved_dir, key, tp_rank, split_factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, + act_range, + multi_query_mode=multi_query_mode) + write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, + split_factor) + + if split_gated_activation: + assert not save_int8 + prefix, dot, suffix = key.rpartition(".") + key = prefix + ".gate" + dot + suffix + + gate = np.concatenate(gates, axis=cat_dim) + split_vals = np.split(gate, split_factor, axis=cat_dim) + save_split(split_vals, saved_dir, key, tp_rank, split_factor) + + elif "attention.query_key_value.weight" in key: + hidden_dim = vals[0].shape[0] + if local_dim is None: + local_dim = vals[0].shape[-1] // 3 + if multi_query_mode: + val = vals[0] + # out_feature = local_dim + 2 * head_size; assumes local_dim equals to hidden_dim + head_size = (val.shape[-1] - local_dim) // 2 + val = val.reshape(hidden_dim, local_dim + 2 * head_size) + w_q, w_kv = np.split(val, [local_dim], axis=-1) + w_q_split = np.split(w_q, split_factor, axis=-1) + split_vals = [np.concatenate((i, w_kv), axis=-1) for i in w_q_split] + else: + if use_attention_nemo_shape: + head_num = num_attention_heads // tp_size + size_per_head = hidden_dim // num_attention_heads + vals = [ + val.reshape(hidden_dim, head_num, 3, size_per_head) + for val in vals + ] + vals = [val.transpose(0, 2, 1, 3) for val in vals] + + vals = [val.reshape(hidden_dim, 3, local_dim) 
for val in vals] + cat_dim = -1 + val = np.concatenate(vals, axis=cat_dim) + split_vals = np.split(val, split_factor, axis=cat_dim) + save_split(split_vals, saved_dir, key, tp_rank, split_factor) + if save_int8: + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, + act_range, + is_qkv=True, + multi_query_mode=multi_query_mode) + write_int8(vals_i8, + saved_dir, + base_key, + cat_dim, + tp_rank, + split_factor, + kv_cache_only=int8_outputs == "kv_cache_only") + elif ("attention.query.weight" in key or "attention.query.bias" in key + or "attention.key_value.weight" in key + or "attention.key_value.bias" in key): + pass + else: + assert False, f"[ERROR] {key} not handled by converter" diff --git a/examples/gptj/weight.py b/examples/gptj/weight.py index 8867fb4b41..9ee593d34d 100644 --- a/examples/gptj/weight.py +++ b/examples/gptj/weight.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import configparser import time from operator import attrgetter from pathlib import Path @@ -22,6 +23,8 @@ import torch import tensorrt_llm import tensorrt_llm.logger as logger +from tensorrt_llm._utils import pad_vocab_size, str_dtype_to_np +from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import GPTJForCausalLM from tensorrt_llm.models.quantized.quant import get_dummy_quant_scales from tensorrt_llm.quantization import QuantMode @@ -108,6 +111,385 @@ def get_scaling_factors( return scaling_factor +def gen_suffix(rank, use_smooth_quant, quant_per_channel): + suffix = f"{rank}.bin" + if use_smooth_quant: + sq_prefix = "int8." + if quant_per_channel: + sq_prefix += "col." + suffix = sq_prefix + suffix + return suffix + + +def extract_layer_idx(name): + ss = name.split('.') + for s in ss: + if s.isdigit(): + return s + return None + + +def split(v, tp_size, idx, dim=0): + if tp_size == 1: + return v + if len(v.shape) == 1: + return np.ascontiguousarray(np.split(v, tp_size)[idx]) + elif len(v.shape) == 2: + return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx]) + return None + + +def parse_config(ini_file): + gpt_config = configparser.ConfigParser() + gpt_config.read(ini_file) + + n_embd = gpt_config.getint('gpt', 'n_embd') + n_head = gpt_config.getint('gpt', 'n_head') + n_layer = gpt_config.getint('gpt', 'n_layer') + n_positions = gpt_config.getint('gpt', 'n_positions') + vocab_size = gpt_config.getint('gpt', 'vocab_size') + do_layer_norm_before = gpt_config.getboolean('gpt', + 'do_layer_norm_before', + fallback=True) + rotary_pct = gpt_config.getfloat('gpt', 'rotary_pct', fallback=0.0) + hidden_act = gpt_config.get('gpt', 'activation_function') + bias = gpt_config.getboolean('gpt', 'bias', fallback=True) + inter_size = gpt_config.getint('gpt', 'intermediate_size', fallback=None) + dtype = gpt_config.get('gpt', 'storage_dtype', fallback='float32') + + if inter_size is None: + inter_size = 4 * n_embd + + multi_query_mode = gpt_config.getboolean('gpt', + 'multi_query_mode', + fallback=False) + prompt_num_tasks = gpt_config.getint('gpt', 'prompt_num_tasks', fallback=0) + prompt_max_vocab_size = gpt_config.getint('gpt', + 'prompt_max_vocab_size', + fallback=0) + return n_embd, n_head, n_layer, n_positions, vocab_size, do_layer_norm_before, hidden_act, rotary_pct, bias, inter_size, multi_query_mode, dtype, prompt_num_tasks, prompt_max_vocab_size + + +def load_from_bin_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, + dir_path, + 
rank=0, + tensor_parallel=1, + dtype='float32', + use_parallel_embedding=False, + sharding_dim=0, + share_embedding_table=False, + scaling_factors=None): + tensorrt_llm.logger.info('Loading weights from bin...') + tik = time.time() + + quant_mode = getattr(tensorrt_llm_gpt_j, 'quant_mode', QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + n_embd, n_head, n_layer, n_positions, vocab_size, do_layer_norm_before, hidden_act, rotary_pct, bias, inter_size, multi_query_mode, *_ = parse_config( + Path(dir_path) / 'config.ini') + np_dtype = str_dtype_to_np(dtype) + + def fromfile(dir_path, name, shape=None, dtype=None): + dtype = np_dtype if dtype is None else dtype + p = dir_path + '/' + name + if Path(p).exists(): + t = np.fromfile(p, dtype=dtype) + if shape is not None: + t = t.reshape(shape) + return t + return None + + def set_smoothquant_scale_factors(module, + pre_scale_weight, + dir_path, + basename, + shape, + per_tok_dyn, + per_channel, + is_qkv=False, + rank=None): + suffix = "bin" + if per_channel: + if rank is not None: + suffix = f"{rank}." + suffix + suffix = "col." + suffix + + col_shape = shape if (per_channel or is_qkv) else [1, 1] + if per_tok_dyn: + if pre_scale_weight is not None: + pre_scale_weight.value = np.array([1.0], dtype=np.float32) + t = fromfile(dir_path, f"{basename}scale_w_quant_orig.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + else: + t = fromfile(dir_path, f"{basename}scale_x_orig_quant.bin", [1], + np.float32) + pre_scale_weight.value = t + t = fromfile(dir_path, f"{basename}scale_y_accum_quant.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + t = fromfile(dir_path, f"{basename}scale_y_quant_orig.bin", [1, 1], + np.float32) + module.act_scale.value = t + + # Do we use SmoothQuant? + use_smooth_quant = quant_mode.has_act_and_weight_quant() + # Do we use quantization per token? + quant_per_token_dyn = quant_mode.has_per_token_dynamic_scaling() + # Do we use quantization per channel? + quant_per_channel = quant_mode.has_per_channel_scaling() + + # Do we use INT4/INT8 weight-only? + use_weight_only = quant_mode.is_weight_only() + + # Int8 KV cache + use_int8_kv_cache = quant_mode.has_int8_kv_cache() + + #Enable FP8 Gemm + enable_fp8_qdq = quant_mode.has_fp8_qdq() + + def sq_trick(x): + return x.view(np.float32) if use_smooth_quant else x + + # Debug + suffix = gen_suffix(rank, use_smooth_quant, quant_per_channel) + # The type of weights. 
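As a brief illustration of the `split` helper added earlier in this file: for tensor parallelism a column-parallel weight is sliced along its output dimension into one contiguous shard per rank. The sizes below are arbitrary toy values, not the real GPT-J dimensions.

```python
# Toy column-parallel split, mirroring split(v, tp_size, idx, dim=...) above.
import numpy as np

n_embd, tp_size = 4, 2
qkv_weight = np.arange(n_embd * 3 * n_embd, dtype=np.float32).reshape(n_embd, 3 * n_embd)

shards = [np.ascontiguousarray(np.split(qkv_weight, tp_size, axis=-1)[rank])
          for rank in range(tp_size)]
assert all(shard.shape == (n_embd, 3 * n_embd // tp_size) for shard in shards)
```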
+ w_type = np_dtype if not use_smooth_quant else np.int8 + + # pe = fromfile(dir_path, 'model.wpe.bin', [n_positions, n_embd]) + # if pe is not None: + # tensorrt_llm_gpt_j.embedding.position_embedding.weight.value = (pe) + + vocab_embedding_weight = fromfile(dir_path, 'model.wte.bin', + [vocab_size, n_embd]) + if not use_parallel_embedding: + tensorrt_llm_gpt_j.embedding.weight.value = vocab_embedding_weight + else: + if sharding_dim == 0: + if vocab_size % tensor_parallel != 0: + # padding + vocab_size_padded = pad_vocab_size( + tensorrt_llm_gpt_j.embedding.num_embeddings, + tensor_parallel) + pad_width = vocab_size_padded - vocab_size + vocab_embedding_weight = np.pad(vocab_embedding_weight, + ((0, pad_width), (0, 0)), + 'constant', + constant_values=0) + tensorrt_llm_gpt_j.embedding.weight.value = np.ascontiguousarray( + split(vocab_embedding_weight, + tensor_parallel, + rank, + dim=sharding_dim)) + + if do_layer_norm_before: + tensorrt_llm_gpt_j.ln_f.bias.value = (fromfile( + dir_path, 'model.final_layernorm.bias.bin')) + tensorrt_llm_gpt_j.ln_f.weight.value = (fromfile( + dir_path, 'model.final_layernorm.weight.bin')) + + # share input embedding + if not share_embedding_table: + lm_head_weight = fromfile(dir_path, 'model.lm_head.weight.bin', + [vocab_size, n_embd]) + lm_head_bias = fromfile(dir_path, 'model.lm_head.bias.bin', + [vocab_size]) + if lm_head_weight is None: + lm_head_weight = fromfile(dir_path, 'model.wte.bin', + [vocab_size, n_embd]) + if vocab_size % tensor_parallel != 0: + # padding + vocab_size_padded = tensorrt_llm_gpt_j.lm_head.out_features * tensor_parallel + pad_width = vocab_size_padded - vocab_size + lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), + 'constant', + constant_values=0) + tensorrt_llm_gpt_j.lm_head.weight.value = np.ascontiguousarray( + split(lm_head_weight, tensor_parallel, rank)) + tensorrt_llm_gpt_j.lm_head.bias.value = np.ascontiguousarray( + split(lm_head_bias, tensor_parallel, rank)) + fake_fp8_sf_dt = np.float32 + for i in range(n_layer): + c_attn_out_dim = (3 * n_embd // + tensor_parallel) if not multi_query_mode else ( + n_embd // tensor_parallel + + (n_embd // n_head) * 2) + tensorrt_llm_gpt_j.layers[i].input_layernorm.weight.value = (fromfile( + dir_path, 'model.layers.' + str(i) + '.input_layernorm.weight.bin')) + tensorrt_llm_gpt_j.layers[i].input_layernorm.bias.value = (fromfile( + dir_path, 'model.layers.' + str(i) + '.input_layernorm.bias.bin')) + t = fromfile( + dir_path, 'model.layers.' + str(i) + + '.attention.query_key_value.weight.' + suffix, + [n_embd, c_attn_out_dim], w_type) + if t is not None: + dst = tensorrt_llm_gpt_j.layers[i].attention.qkv.weight + if use_smooth_quant: + dst.value = sq_trick( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + set_smoothquant_scale_factors( + tensorrt_llm_gpt_j.layers[i].attention.qkv, + tensorrt_llm_gpt_j.layers[i].input_layernorm.scale_to_int, + dir_path, + 'model.layers.' 
+ str(i) + '.attention.query_key_value.', + [1, c_attn_out_dim], + quant_per_token_dyn, + quant_per_channel, + rank=rank, + is_qkv=True) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[ + i].attention.qkv.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + if enable_fp8_qdq: + tensorrt_llm_gpt_j.layers[ + i].attention.qkv.activation_scaling_factor.value = np.array( + [scaling_factors['qkv_act'][i]], dtype=fake_fp8_sf_dt) + tensorrt_llm_gpt_j.layers[ + i].attention.qkv.weights_scaling_factor.value = np.array( + [scaling_factors['qkv_weights'][i]], dtype=fake_fp8_sf_dt) + tensorrt_llm_gpt_j.layers[ + i].attention.kv_orig_quant_scale.value = np.array( + [scaling_factors['qkv_output'][i]], dtype=np.float32) + tensorrt_llm_gpt_j.layers[ + i].attention.kv_quant_orig_scale.value = np.array( + [1.0 / scaling_factors['qkv_output'][i]], dtype=np.float32) + + dst = tensorrt_llm_gpt_j.layers[i].attention.dense.weight + t = fromfile( + dir_path, + 'model.layers.' + str(i) + '.attention.dense.weight.' + suffix, + [n_embd // tensor_parallel, n_embd], w_type) + if use_smooth_quant: + dst.value = sq_trick(np.ascontiguousarray(np.transpose(t, [1, 0]))) + dense_scale = getattr(tensorrt_llm_gpt_j.layers[i].attention, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_gpt_j.layers[i].attention.dense, dense_scale, + dir_path, 'model.layers.' + str(i) + '.attention.dense.', + [1, n_embd], quant_per_token_dyn, quant_per_channel) + # change it to the real smoother if dense layer is applied smooth quant + tensorrt_llm_gpt_j.layers[ + i].attention.dense.smoother.value = np.ones( + [1, n_embd // tensor_parallel], dtype=np.float32) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[ + i].attention.dense.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + if enable_fp8_qdq: + tensorrt_llm_gpt_j.layers[ + i].attention.dense.activation_scaling_factor.value = np.array( + [scaling_factors['dense_act'][i]], dtype=fake_fp8_sf_dt) + tensorrt_llm_gpt_j.layers[ + i].attention.dense.weights_scaling_factor.value = np.array( + [scaling_factors['dense_weights'][i]], dtype=fake_fp8_sf_dt) + + t = fromfile( + dir_path, + 'model.layers.' + str(i) + '.mlp.dense_h_to_4h.weight.' + suffix, + [n_embd, inter_size // tensor_parallel], w_type) + if use_smooth_quant: + tensorrt_llm_gpt_j.layers[i].mlp.fc.weight.value = sq_trick( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + set_smoothquant_scale_factors( + tensorrt_llm_gpt_j.layers[i].mlp.fc, + tensorrt_llm_gpt_j.layers[i].post_layernorm.scale_to_int, + dir_path, + 'model.layers.' 
+ str(i) + '.mlp.dense_h_to_4h.', + [1, inter_size // tensor_parallel], + quant_per_token_dyn, + quant_per_channel, + rank=rank) + elif use_weight_only: + dst = tensorrt_llm_gpt_j.layers[i].mlp.fc.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[i].mlp.fc.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_gpt_j.layers[ + i].mlp.fc.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + if bias: + tensorrt_llm_gpt_j.layers[i].mlp.fc.bias.value = fromfile( + dir_path, 'model.layers.' + str(i) + + '.mlp.dense_h_to_4h.bias.' + str(rank) + '.bin') + if enable_fp8_qdq: + tensorrt_llm_gpt_j.layers[ + i].mlp.fc.activation_scaling_factor.value = np.array( + [scaling_factors['fc_act'][i]], dtype=fake_fp8_sf_dt) + tensorrt_llm_gpt_j.layers[ + i].mlp.fc.weights_scaling_factor.value = np.array( + [scaling_factors['fc_weights'][i]], dtype=fake_fp8_sf_dt) + + t = fromfile( + dir_path, + 'model.layers.' + str(i) + '.mlp.dense_4h_to_h.weight.' + suffix, + [inter_size // tensor_parallel, n_embd], w_type) + if use_smooth_quant: + tensorrt_llm_gpt_j.layers[i].mlp.proj.weight.value = sq_trick( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + proj_scale = getattr(tensorrt_llm_gpt_j.layers[i].mlp, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_gpt_j.layers[i].mlp.proj, proj_scale, dir_path, + 'model.layers.' + str(i) + '.mlp.dense_4h_to_h.', [1, n_embd], + quant_per_token_dyn, quant_per_channel) + # change it to the real smoother if proj layer is applied smooth quant + tensorrt_llm_gpt_j.layers[i].mlp.proj.smoother.value = np.ones( + [1, inter_size // tensor_parallel], dtype=np.float32) + elif use_weight_only: + dst = tensorrt_llm_gpt_j.layers[i].mlp.proj.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[i].mlp.proj.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_gpt_j.layers[i].mlp.proj.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + if bias: + tensorrt_llm_gpt_j.layers[i].mlp.proj.bias.value = fromfile( + dir_path, + 'model.layers.' + str(i) + '.mlp.dense_4h_to_h.bias.bin') + + if use_int8_kv_cache: + t = fromfile( + dir_path, 'model.layers.' + str(i) + + '.attention.query_key_value.scale_y_quant_orig.bin', [1], + np.float32) + tensorrt_llm_gpt_j.layers[ + i].attention.kv_orig_quant_scale.value = 1.0 / t + tensorrt_llm_gpt_j.layers[i].attention.kv_quant_orig_scale.value = t + + if enable_fp8_qdq: + tensorrt_llm_gpt_j.layers[ + i].mlp.proj.activation_scaling_factor.value = np.array( + [scaling_factors['proj_act'][i]], dtype=fake_fp8_sf_dt) + tensorrt_llm_gpt_j.layers[ + i].mlp.proj.weights_scaling_factor.value = np.array( + [scaling_factors['proj_weights'][i]], dtype=fake_fp8_sf_dt) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. 
Total time: {t}') + + def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, hf_gpt_j, fp16=False, @@ -132,6 +514,13 @@ def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, ] quant_mode = getattr(tensorrt_llm_gpt_j, 'quant_mode', QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + + # Do we use INT4/INT8 weight-only? + use_weight_only = quant_mode.is_weight_only() tensorrt_llm.logger.info('Loading weights from HF GPT-J...') tik = time.time() @@ -171,7 +560,21 @@ def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, layer_idx].mlp.proj.weights_scaling_factor.value = np.array( [scaling_factors['proj_weights'][layer_idx]], dtype=np.float32) - setattr(layer, 'value', v.to(torch_dtype).cpu().numpy()) + if use_weight_only and (idx == 2 or idx == 4): + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + v.transpose(0, 1).contiguous(), plugin_weight_only_quant_type + ) + layer.value = processed_torch_weights.numpy() + if idx == 2: + scales = tensorrt_llm_gpt_j.layers[ + layer_idx].mlp.fc.per_channel_scale + elif idx == 4: + scales = tensorrt_llm_gpt_j.layers[ + layer_idx].mlp.proj.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + setattr(layer, 'value', v.to(torch_dtype).cpu().numpy()) # Attention QKV Linear # concatenate the Q, K, V layers weights. @@ -181,7 +584,16 @@ def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, qkv_weights = torch.cat((q_weights, k_weights, v_weights)) layer = attrgetter("attention.qkv.weight")( tensorrt_llm_gpt_j.layers[layer_idx]) - setattr(layer, "value", qkv_weights.to(torch_dtype).cpu().numpy()) + if use_weight_only: + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + qkv_weights.transpose(0, 1).contiguous(), plugin_weight_only_quant_type) + layer.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[ + layer_idx].attention.qkv.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + setattr(layer, "value", qkv_weights.to(torch_dtype).cpu().numpy()) if scaling_factors: tensorrt_llm_gpt_j.layers[ layer_idx].attention.qkv.activation_scaling_factor.value = np.array( @@ -206,7 +618,16 @@ def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, v = hf_gpt_j_state_dict.get(prefix + "attn.out_proj.weight") layer = attrgetter("attention.dense.weight")( tensorrt_llm_gpt_j.layers[layer_idx]) - setattr(layer, "value", v.to(torch_dtype).cpu().numpy()) + if use_weight_only: + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + v.transpose(0, 1).contiguous(), plugin_weight_only_quant_type) + layer.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[ + layer_idx].attention.dense.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + setattr(layer, "value", v.to(torch_dtype).cpu().numpy()) if scaling_factors: tensorrt_llm_gpt_j.layers[ layer_idx].attention.dense.activation_scaling_factor.value = np.array( @@ -233,106 +654,13 @@ def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, tensorrt_llm.logger.info(f'Weights loaded. 
Total time: {t}') -def AWQ_quantize_pack_preprocess(weight, scale, group_size, packer, - preprocessor): - scale = scale.repeat_interleave(group_size, dim=0) - weight = weight / scale - weight = torch.round(weight).char() - weight = torch.where(weight > 7, 7, weight) - qweight_int8 = torch.where(weight < -8, -8, weight) - int4_weight = packer(qweight_int8.cpu()) - int4_weight = preprocessor(int4_weight, torch.quint4x2) - return int4_weight.view(torch.int8).cpu().numpy() - - -def process_and_assign_weight(awq_gpt_j, mPrefix, mOp, group_size, packer, - preprocessor, torch_dtype): - weight = awq_gpt_j[mPrefix + ".weight"].T.contiguous() - [k, n] = weight.shape - amax = awq_gpt_j[mPrefix + ".weight_quantizer._amax"].reshape( - (n, int(k / group_size))).T.contiguous() - pre_quant_scale = awq_gpt_j[mPrefix + - ".input_quantizer._pre_quant_scale"].reshape( - (1, k)) - scale = amax / 8.0 - mOp.qweight.value = AWQ_quantize_pack_preprocess(weight, scale, group_size, - packer, preprocessor) - mOp.scale.value = scale.to(torch_dtype).cpu().numpy() - mOp.pre_quant_scale.value = pre_quant_scale.to(torch_dtype).cpu().numpy() - - -def deSmooth(weight, pre_quant_scale): - [k, n] = weight.shape - pre_quant_scale = pre_quant_scale.repeat((n, 1)).transpose(1, - 0).contiguous() - weight = weight * pre_quant_scale - return weight - - -def reSmooth(weight, pre_quant_scale): - [k, n] = weight.shape - pre_quant_scale = pre_quant_scale.repeat((n, 1)).transpose(1, - 0).contiguous() - weight = weight / pre_quant_scale - return weight - - -def get_scale(weight, group_size): - weight = weight.T.contiguous() - [n, k] = weight.shape - weight = weight.reshape(n, int(k / group_size), group_size) - weight = torch.abs(weight.reshape(-1, group_size)) - amax, idx = weight.max(1) - amax = amax.reshape(n, int(k / group_size)).T.contiguous() - return amax / 8 - - -def reSmooth_and_get_scale(weight, pre_quant_scale, avg_pre_quant_scale, - group_size): - weight = deSmooth(weight, pre_quant_scale) - weight = reSmooth(weight, avg_pre_quant_scale) - scale = get_scale(weight, group_size) - return weight, scale - - -def process_and_assign_qkv_weight(awq_gpt_j, prefix, mOp, group_size, packer, - preprocessor, torch_dtype): - q_weight = awq_gpt_j[prefix + "attn.q_proj.weight"].T.contiguous() - k_weight = awq_gpt_j[prefix + "attn.k_proj.weight"].T.contiguous() - v_weight = awq_gpt_j[prefix + "attn.v_proj.weight"].T.contiguous() - [k, n] = q_weight.shape - - q_pre_quant_scale = awq_gpt_j[ - prefix + "attn.q_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) - k_pre_quant_scale = awq_gpt_j[ - prefix + "attn.k_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) - v_pre_quant_scale = awq_gpt_j[ - prefix + "attn.v_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) - - qkv_pre_quant_scale = (q_pre_quant_scale + k_pre_quant_scale + - v_pre_quant_scale) / 3.0 - q_weight, q_scale = reSmooth_and_get_scale(q_weight, q_pre_quant_scale, - qkv_pre_quant_scale, group_size) - k_weight, k_scale = reSmooth_and_get_scale(k_weight, k_pre_quant_scale, - qkv_pre_quant_scale, group_size) - v_weight, v_scale = reSmooth_and_get_scale(v_weight, v_pre_quant_scale, - qkv_pre_quant_scale, group_size) - - qkv_weights = torch.cat((q_weight, k_weight, v_weight), dim=1) - qkv_scale = torch.cat((q_scale, k_scale, v_scale), dim=1) - mOp.pre_quant_scale.value = qkv_pre_quant_scale.to( - torch_dtype).cpu().numpy() - mOp.qweight.value = AWQ_quantize_pack_preprocess(qkv_weights, qkv_scale, - group_size, packer, - preprocessor) - mOp.scale.value = 
qkv_scale.to(torch_dtype).cpu().numpy() - - def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, awq_gpt_j, config, + mapping=Mapping(), fp16=False, - group_size=128): + group_size=128, + ft_model_dir=None): awq_gptj_block_names = [ "ln_1.weight", @@ -348,7 +676,18 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, "mlp.proj.bias", ] - getattr(tensorrt_llm_gpt_j, 'quant_mode', QuantMode(0)) + def fromfile(dir_path, name, shape=None, dtype=None): + p = dir_path + '/' + name + if Path(p).exists(): + t = np.fromfile(p, dtype=dtype) + if shape is not None: + t = t.reshape(shape) + return t + return None + + quant_mode = getattr(tensorrt_llm_gpt_j, 'quant_mode', QuantMode(0)) + # Int8 KV cache + use_int8_kv_cache = quant_mode.has_int8_kv_cache() packer = torch.ops.fastertransformer.pack_int8_tensor_to_packed_int4 preprocessor = torch.ops.fastertransformer.preprocess_weights_for_mixed_gemm @@ -358,6 +697,103 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, torch_dtype = torch.float16 if fp16 else torch.float32 + def AWQ_quantize_pack_preprocess(weight, scale): + scale = scale.repeat_interleave(group_size, dim=0) + weight = weight / scale + qweight_int8 = torch.clamp(torch.round(weight.cuda()).char(), -8, 7) + int4_weight = packer(qweight_int8.cpu()) + int4_weight = preprocessor(int4_weight, torch.quint4x2) + return int4_weight.view(torch.int8).cpu().numpy() + + def process_and_assign_weight(awq_gpt_j, mPrefix, mOp, tp_dim=0): + weight = awq_gpt_j[mPrefix + ".weight"].T.contiguous() + [k, n] = weight.shape + weight = weight.split(weight.shape[tp_dim] // mapping.tp_size, + dim=tp_dim)[mapping.tp_rank] + amax = awq_gpt_j[mPrefix + ".weight_quantizer._amax"].reshape( + (n, int(k / group_size))).T.contiguous() + amax = amax.split(amax.shape[tp_dim] // mapping.tp_size, + dim=tp_dim)[mapping.tp_rank] + pre_quant_scale = awq_gpt_j[ + mPrefix + ".input_quantizer._pre_quant_scale"].reshape((1, k)) + if tp_dim == 0: + pre_quant_scale = pre_quant_scale.split(k // mapping.tp_size, + dim=1)[mapping.tp_rank] + scale = amax / 8.0 + mOp.qweight.value = AWQ_quantize_pack_preprocess(weight, scale) + mOp.scale.value = scale.to(torch_dtype).cpu().numpy() + mOp.pre_quant_scale.value = pre_quant_scale.to( + torch_dtype).cpu().numpy() + + def deSmooth(weight, pre_quant_scale): + [k, n] = weight.shape + pre_quant_scale = pre_quant_scale.repeat( + (n, 1)).transpose(1, 0).contiguous() + weight = weight * pre_quant_scale + return weight + + def reSmooth(weight, pre_quant_scale): + [k, n] = weight.shape + pre_quant_scale = pre_quant_scale.repeat( + (n, 1)).transpose(1, 0).contiguous() + weight = weight / pre_quant_scale + return weight + + def get_scale(weight): + weight = weight.T.contiguous() + [n, k] = weight.shape + weight = weight.reshape(n, int(k / group_size), group_size) + weight = torch.abs(weight.reshape(-1, group_size)) + amax, idx = weight.max(1) + amax = amax.reshape(n, int(k / group_size)).T.contiguous() + return amax / 8 + + def reSmooth_and_get_scale(weight, pre_quant_scale, avg_pre_quant_scale): + weight = deSmooth(weight, pre_quant_scale) + weight = reSmooth(weight, avg_pre_quant_scale) + scale = get_scale(weight) + return weight, scale + + def process_and_assign_qkv_weight(awq_gpt_j, prefix, mOp): + q_weight = awq_gpt_j[prefix + "attn.q_proj.weight"].T.contiguous() + k_weight = awq_gpt_j[prefix + "attn.k_proj.weight"].T.contiguous() + v_weight = awq_gpt_j[prefix + "attn.v_proj.weight"].T.contiguous() + k = q_weight.shape[0] + + q_weight = 
q_weight.split(q_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + k_weight = k_weight.split(k_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + v_weight = v_weight.split(v_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + + q_pre_quant_scale = awq_gpt_j[ + prefix + "attn.q_proj.input_quantizer._pre_quant_scale"].reshape( + (1, k)) + k_pre_quant_scale = awq_gpt_j[ + prefix + "attn.k_proj.input_quantizer._pre_quant_scale"].reshape( + (1, k)) + v_pre_quant_scale = awq_gpt_j[ + prefix + "attn.v_proj.input_quantizer._pre_quant_scale"].reshape( + (1, k)) + + qkv_pre_quant_scale = (q_pre_quant_scale + k_pre_quant_scale + + v_pre_quant_scale) / 3.0 + q_weight, q_scale = reSmooth_and_get_scale(q_weight, q_pre_quant_scale, + qkv_pre_quant_scale) + k_weight, k_scale = reSmooth_and_get_scale(k_weight, k_pre_quant_scale, + qkv_pre_quant_scale) + v_weight, v_scale = reSmooth_and_get_scale(v_weight, v_pre_quant_scale, + qkv_pre_quant_scale) + + qkv_weights = torch.cat((q_weight, k_weight, v_weight), dim=1) + qkv_scale = torch.cat((q_scale, k_scale, v_scale), dim=1) + + mOp.pre_quant_scale.value = qkv_pre_quant_scale.to( + torch_dtype).cpu().numpy() + mOp.qweight.value = AWQ_quantize_pack_preprocess(qkv_weights, qkv_scale) + mOp.scale.value = qkv_scale.to(torch_dtype).cpu().numpy() + #check if we need to pad vocab v = awq_gpt_j.get('transformer.wte.weight') [vocab_size, k] = v.shape @@ -379,6 +815,10 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, tensorrt_llm.logger.info(f'Process weights in layer: {layer_idx}') for idx, awq_attr in enumerate(awq_gptj_block_names): v = awq_gpt_j[prefix + awq_attr] + if awq_attr == "mlp.fc_in.bias": + v = v.split(v.shape[0] // mapping.tp_size, dim=0)[mapping.rank] + elif awq_attr == "mlp.fc_out.bias": + v = torch.zeros_like(v) if mapping.rank != 0 else v layer = attrgetter(tensorrt_llm_model_gptj_block_names[idx])( tensorrt_llm_gpt_j.layers[layer_idx]) setattr(layer, 'value', v.to(torch_dtype).cpu().numpy()) @@ -387,26 +827,34 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, # concatenate the Q, K, V layers weights. process_and_assign_qkv_weight( awq_gpt_j, prefix, - tensorrt_llm_gpt_j.layers[layer_idx].attention.qkv, group_size, - packer, preprocessor, torch_dtype) + tensorrt_llm_gpt_j.layers[layer_idx].attention.qkv) # Attention Dense (out_proj) Linear mPrefix = prefix + "attn.out_proj" mOp = tensorrt_llm_gpt_j.layers[layer_idx].attention.dense - process_and_assign_weight(awq_gpt_j, mPrefix, mOp, group_size, packer, - preprocessor, torch_dtype) + process_and_assign_weight(awq_gpt_j, mPrefix, mOp, 0) # MLP Dense (mlp.fc) Linear mPrefix = prefix + "mlp.fc_in" mOp = tensorrt_llm_gpt_j.layers[layer_idx].mlp.fc - process_and_assign_weight(awq_gpt_j, mPrefix, mOp, group_size, packer, - preprocessor, torch_dtype) + process_and_assign_weight(awq_gpt_j, mPrefix, mOp, 1) # MLP Dense (mlp.proj) Linear mPrefix = prefix + "mlp.fc_out" mOp = tensorrt_llm_gpt_j.layers[layer_idx].mlp.proj - process_and_assign_weight(awq_gpt_j, mPrefix, mOp, group_size, packer, - preprocessor, torch_dtype) + process_and_assign_weight(awq_gpt_j, mPrefix, mOp, 0) + + if use_int8_kv_cache: + assert ft_model_dir, "You must pass --ft_model_dir to tell TRT-LLM where to look for scales of INT8 kv cache." + t = fromfile( + ft_model_dir, 'model.layers.' 
+ str(layer_idx) + + '.attention.query_key_value.scale_y_quant_orig.bin', [1], + np.float32) + assert t is not None, f"{ft_model_dir} does not contain model.layers.{layer_idx}.attention.query_key_value.scale_y_quant_orig.bin" + tensorrt_llm_gpt_j.layers[ + layer_idx].attention.kv_orig_quant_scale.value = 1.0 / t + tensorrt_llm_gpt_j.layers[ + layer_idx].attention.kv_quant_orig_scale.value = t v = awq_gpt_j['transformer.ln_f.weight'] tensorrt_llm_gpt_j.ln_f.weight.value = v.to(torch_dtype).cpu().numpy() @@ -421,14 +869,18 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, new_weight = torch.zeros([pad_vocab_size, k]) new_weight[:vocab_size, :] = weight new_weight = new_weight.T.contiguous() + new_weight = new_weight.split(new_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] amax = awq_gpt_j['lm_head.weight_quantizer._amax'].reshape( [vocab_size, int(k / group_size)]) new_amax = torch.ones([pad_vocab_size, int(k / group_size)]) new_amax[:vocab_size, :] = amax new_amax = new_amax.T.contiguous() + new_amax = new_amax.split(new_amax.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] new_scale = new_amax / 8 tensorrt_llm_gpt_j.lm_head.qweight.value = AWQ_quantize_pack_preprocess( - new_weight, new_scale, group_size, packer, preprocessor) + new_weight, new_scale) tensorrt_llm_gpt_j.lm_head.scale.value = new_scale.to( torch_dtype).cpu().numpy() tensorrt_llm_gpt_j.lm_head.pre_quant_scale.value = awq_gpt_j[ @@ -438,13 +890,14 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, bias = awq_gpt_j['lm_head.bias'] new_bias = torch.zeros([pad_vocab_size]) new_bias[:vocab_size] = bias + new_bias = new_bias.split(pad_vocab_size // mapping.tp_size, + dim=0)[mapping.tp_rank] tensorrt_llm_gpt_j.lm_head.bias.value = new_bias.to( torch_dtype).cpu().numpy() else: mPrefix = "lm_head" mOp = tensorrt_llm_gpt_j.lm_head - process_and_assign_weight(awq_gpt_j, mPrefix, mOp, group_size, packer, - preprocessor, torch_dtype) + process_and_assign_weight(awq_gpt_j, mPrefix, mOp, 1) v = awq_gpt_j['lm_head.bias'] tensorrt_llm_gpt_j.lm_head.bias.value = v.to(torch_dtype).cpu().numpy() diff --git a/examples/gptneox/build.py b/examples/gptneox/build.py index ca5c50142c..01c55c32f8 100644 --- a/examples/gptneox/build.py +++ b/examples/gptneox/build.py @@ -28,8 +28,7 @@ import tensorrt_llm from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (weight_only_groupwise_quantize, - weight_only_quantize) +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -269,15 +268,22 @@ def build_rank_engine(builder: Builder, use_parallel_embedding=args.use_parallel_embedding, embedding_sharding_dim=args.embedding_sharding_dim) - if args.use_weight_only_quant_matmul_plugin: - tensorrt_llm_gpt = weight_only_quantize(tensorrt_llm_gpt) - - if args.use_weight_only_groupwise_quant_matmul_plugin: - tensorrt_llm_gpt = weight_only_groupwise_quantize( - model=tensorrt_llm_gpt, - quant_mode=QuantMode(0), - group_size=128, - zero=True) + if args.use_weight_only_quant_matmul_plugin or args.use_weight_only_groupwise_quant_matmul_plugin: + quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=args.use_weight_only_groupwise_quant_matmul_plugin, + use_int4_weights=False) + quantize_kwargs = 
{}
+        if args.use_weight_only_groupwise_quant_matmul_plugin:
+            quantize_kwargs = {
+                "group_size": 128,
+                "zero": True,
+            }
+        tensorrt_llm_gpt = quantize_model(tensorrt_llm_gpt, quant_mode,
+                                          **quantize_kwargs)
     if args.model_dir is not None:
         assert hf_gpt is not None, f'Could not load weights from hf_gpt model as it is not loaded yet.'
@@ -341,6 +347,9 @@ def build_rank_engine(builder: Builder,
     if rank == 0:
         config_path = os.path.join(args.output_dir, 'config.json')
         builder.save_config(builder_config, config_path)
+
+    tensorrt_llm.tools.cleanup(network, tensorrt_llm_gpt)
+
     return engine
diff --git a/examples/internlm/.gitignore b/examples/internlm/.gitignore
new file mode 100644
index 0000000000..7ce339719a
--- /dev/null
+++ b/examples/internlm/.gitignore
@@ -0,0 +1,2 @@
+internlm*
+tokenizer.model
diff --git a/examples/internlm/README.md b/examples/internlm/README.md
new file mode 100644
index 0000000000..2f7a3cee7b
--- /dev/null
+++ b/examples/internlm/README.md
@@ -0,0 +1,307 @@
+# InternLM
+
+This document shows how to build and run InternLM 7B / 20B models in TensorRT-LLM on a single GPU, on a single node with multiple GPUs, and on multiple nodes with multiple GPUs.
+
+## Overview
+
+The TensorRT-LLM InternLM implementation can be found in [tensorrt_llm/models/internlm/model.py](../../tensorrt_llm/models/internlm/model.py). The TensorRT-LLM InternLM example code is located in [`examples/internlm`](./). There are three main files in that folder:
+
+ * [`build.py`](./build.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the InternLM model,
+ * [`run.py`](./run.py) to run the inference on an input text,
+ * [`summarize.py`](./summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset using the model.
+
+## Support Matrix
+  * FP16 / BF16
+  * INT8 & INT4 Weight-Only
+  * SmoothQuant
+  * INT8 KV Cache
+  * Tensor Parallel & Pipeline Parallel
+
+## Usage
+
+The TensorRT-LLM InternLM example code is located in [examples/internlm](./). It takes HF weights as input, and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
+
+### Build TensorRT engine(s)
+
+TensorRT-LLM InternLM builds TensorRT engine(s) from an HF checkpoint. If no checkpoint directory is specified, TensorRT-LLM will build engine(s) with dummy weights.
+
+InternLM has released several checkpoints of different sizes and capabilities under https://huggingface.co/internlm. Users can pick any one repository and follow its instructions to prepare the checkpoint.
+
+The examples below use [internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) and [internlm-chat-20b](https://huggingface.co/internlm/internlm-chat-20b) and assume these repositories are cloned or linked under this directory, for example `./internlm-chat-7b/`.
+
+Normally `build.py` only requires a single GPU, but if you already have all the GPUs needed for inference, you can make engine building faster by adding the `--parallel_build` argument. Please note that the `parallel_build` feature currently supports a single node only.
+
+Here are some examples:
+
+```bash
+# Build a single-GPU float16 engine from HF weights.
+# use_gpt_attention_plugin is necessary in InternLM.
+# Try use_gemm_plugin to prevent accuracy issues.
+# It is recommended to use --remove_input_padding along with --use_gpt_attention_plugin for better performance.
+
+# Build the InternLM 7B model using a single GPU and FP16.
+python build.py --model_dir ./internlm-chat-7b/ \
+                --dtype float16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin float16 \
+                --enable_context_fmha \
+                --use_gemm_plugin float16 \
+                --output_dir ./internlm-chat-7b/trt_engines/fp16/1-gpu/
+
+# Build the InternLM 7B model using a single GPU and BF16.
+python build.py --model_dir ./internlm-chat-7b/ \
+                --dtype bfloat16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin bfloat16 \
+                --enable_context_fmha \
+                --use_gemm_plugin bfloat16 \
+                --output_dir ./internlm-chat-7b/trt_engines/bf16/1-gpu/
+
+# Build the InternLM 7B model using a single GPU and apply INT8 weight-only quantization.
+python build.py --model_dir ./internlm-chat-7b/ \
+                --dtype float16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin float16 \
+                --enable_context_fmha \
+                --use_gemm_plugin float16 \
+                --use_weight_only \
+                --output_dir ./internlm-chat-7b/trt_engines/weight_only/1-gpu/
+
+# Note: set `--weight_only_precision int4` to use INT4 weight-only quantization.
+
+# Build InternLM 7B using 2-way tensor parallelism.
+python build.py --model_dir ./internlm-chat-7b/ \
+                --dtype float16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin float16 \
+                --enable_context_fmha \
+                --use_gemm_plugin float16 \
+                --output_dir ./internlm-chat-7b/trt_engines/fp16/2-gpu/ \
+                --world_size 2 \
+                --tp_size 2 \
+                --parallel_build
+
+# Build InternLM 20B using 2-way tensor parallelism and 2-way pipeline parallelism.
+python build.py --model_dir ./internlm-chat-20b/ \
+                --dtype bfloat16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin bfloat16 \
+                --enable_context_fmha \
+                --use_gemm_plugin bfloat16 \
+                --output_dir ./internlm-chat-20b/trt_engines/bf16/4-gpu/ \
+                --world_size 4 \
+                --tp_size 2 \
+                --pp_size 2 \
+                --parallel_build
+```
+
+#### INT8 weight only + INT8 KV cache
+
+For INT8 KV cache, [`hf_internlm_convert.py`](./hf_internlm_convert.py) features a
+`--calibrate-kv-cache, -kv` option. Setting `-kv` will calibrate the model,
+and then export the scaling factors needed for INT8 KV cache inference.
+
+Example:
+
+```bash
+# For 7B models
+python hf_internlm_convert.py -i ./internlm-chat-7b -o ./internlm-chat-7b/smooth_internlm/int8_kv_cache/ --calibrate-kv-cache -t fp16
+# For 20B models
+python hf_internlm_convert.py -i ./internlm-chat-20b -o ./internlm-chat-20b/smooth_internlm/int8_kv_cache/ --calibrate-kv-cache -t fp16
+```
+
+[`build.py`](./build.py) adds new options to support INT8 KV cache.
+
+`--int8_kv_cache` is the command-line option to enable INT8 KV cache.
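+
+For illustration, INT8 KV cache can also be enabled on its own, without weight-only quantization. The sketch below reuses the calibrated scales exported above; the output directory is only an example and is not referenced elsewhere in this document:
+
+```bash
+# Build the 7B model with INT8 KV cache only (example output path).
+python build.py --ft_model_dir=./internlm-chat-7b/smooth_internlm/int8_kv_cache/1-gpu/ \
+                --dtype float16 \
+                --use_gpt_attention_plugin float16 \
+                --use_gemm_plugin float16 \
+                --int8_kv_cache \
+                --output_dir ./internlm-chat-7b/trt_engines/int8_kv_cache/1-gpu
+```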
+
+In addition, it can be combined with INT8 weight-only quantization, as follows:
+
+Examples of INT8 weight-only quantization + INT8 KV cache:
+
+```bash
+# Build 7B model with both INT8 weight-only and INT8 KV cache enabled
+python build.py --ft_model_dir=./internlm-chat-7b/smooth_internlm/int8_kv_cache/1-gpu/ \
+                --dtype float16 \
+                --use_gpt_attention_plugin float16 \
+                --use_gemm_plugin float16 \
+                --output_dir ./internlm-chat-7b/trt_engines/int8_kv_cache_weight_only/1-gpu \
+                --int8_kv_cache \
+                --use_weight_only
+
+# Build 20B model with both INT8 weight-only and INT8 KV cache enabled
+python build.py --ft_model_dir=./internlm-chat-20b/smooth_internlm/int8_kv_cache/1-gpu/ \
+                --dtype float16 \
+                --use_gpt_attention_plugin float16 \
+                --use_gemm_plugin float16 \
+                --output_dir ./internlm-chat-20b/trt_engines/int8_kv_cache_weight_only/1-gpu \
+                --int8_kv_cache \
+                --use_weight_only
+```
+
+Test with `run.py` or `summarize.py`:
+
+```bash
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-7b/ \
+              --engine_dir ./internlm-chat-7b/trt_engines/int8_kv_cache_weight_only/1-gpu
+
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-20b/ \
+              --engine_dir ./internlm-chat-20b/trt_engines/int8_kv_cache_weight_only/1-gpu
+
+python summarize.py --test_trt_llm --test_hf \
+                    --hf_model_location ./internlm-chat-7b \
+                    --data_type fp16 \
+                    --engine_dir ./internlm-chat-7b/trt_engines/int8_kv_cache_weight_only/1-gpu
+
+python summarize.py --test_trt_llm --test_hf \
+                    --hf_model_location ./internlm-chat-20b \
+                    --data_type fp16 \
+                    --engine_dir ./internlm-chat-20b/trt_engines/int8_kv_cache_weight_only/1-gpu
+```
+
+#### SmoothQuant
+
+Unlike the FP16 build, where the HF weights are processed and loaded into TensorRT-LLM directly, SmoothQuant needs to load INT8 weights that are pre-processed before building an engine.
+
+Example:
+```bash
+# For 7B models
+python hf_internlm_convert.py -i ./internlm-chat-7b -o ./internlm-chat-7b/smooth_internlm/sq0.5/ -sq 0.5 --tensor-parallelism 1 --storage-type fp16
+# For 20B models
+python hf_internlm_convert.py -i ./internlm-chat-20b -o ./internlm-chat-20b/smooth_internlm/sq0.5/ -sq 0.5 --tensor-parallelism 1 --storage-type fp16
+```
+
+[`build.py`](./build.py) adds new options to support INT8 inference of SmoothQuant models.
+
+`--use_smooth_quant` is the starting point of INT8 inference. By default, it
+will run the model in the _per-tensor_ mode.
+
+Then, you can add any combination of `--per_token` and `--per_channel` to get the corresponding behaviors.
+
+Examples of build invocations:
+
+```bash
+# Build model for SmoothQuant in the _per_tensor_ mode.
+
+# 7B model
+python build.py --ft_model_dir=./internlm-chat-7b/smooth_internlm/sq0.5/1-gpu/ \
+                --use_smooth_quant \
+                --output_dir ./internlm-chat-7b/trt_engines/smoothquant/1-gpu
+
+# 20B model
+python build.py --ft_model_dir=./internlm-chat-20b/smooth_internlm/sq0.5/1-gpu/ \
+                --use_smooth_quant \
+                --output_dir ./internlm-chat-20b/trt_engines/smoothquant/1-gpu
+
+# OR build model for SmoothQuant in the _per_token_ + _per_channel_ mode
+# 7B model
+python build.py --ft_model_dir=./internlm-chat-7b/smooth_internlm/sq0.5/1-gpu/ \
+                --use_smooth_quant \
+                --per_token \
+                --per_channel \
+                --output_dir ./internlm-chat-7b/trt_engines/smoothquant/1-gpu
+
+# 20B model
+python build.py --ft_model_dir=./internlm-chat-20b/smooth_internlm/sq0.5/1-gpu/ \
+                --use_smooth_quant \
+                --per_token \
+                --per_channel \
+                --output_dir ./internlm-chat-20b/trt_engines/smoothquant/1-gpu
+```
+
+Note that we use `--ft_model_dir` instead of `--model_dir` and `--meta_ckpt_dir`, since the SmoothQuant model needs INT8 weights and various scales from the binary files.
+
+Test with `run.py` or `summarize.py`:
+
+```bash
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-7b/ \
+              --engine_dir ./internlm-chat-7b/trt_engines/smoothquant/1-gpu
+
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-20b/ \
+              --engine_dir ./internlm-chat-20b/trt_engines/smoothquant/1-gpu
+
+python summarize.py --test_trt_llm --test_hf \
+                    --hf_model_location ./internlm-chat-7b \
+                    --data_type fp16 \
+                    --engine_dir ./internlm-chat-7b/trt_engines/smoothquant/1-gpu
+
+python summarize.py --test_trt_llm --test_hf \
+                    --hf_model_location ./internlm-chat-20b \
+                    --data_type fp16 \
+                    --engine_dir ./internlm-chat-20b/trt_engines/smoothquant/1-gpu
+```
+
+### Run
+
+To run a TensorRT-LLM InternLM model using the engines generated by `build.py`:
+
+```bash
+# InternLM 7B with fp16
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-7b/ \
+              --engine_dir=./internlm-chat-7b/trt_engines/fp16/1-gpu/
+
+# InternLM 7B with bf16
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-7b/ \
+              --engine_dir=./internlm-chat-7b/trt_engines/bf16/1-gpu/
+
+# InternLM 7B with int8 weight-only quantization
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-7b/ \
+              --engine_dir=./internlm-chat-7b/trt_engines/weight_only/1-gpu/
+
+# InternLM 7B with fp16 and 2-way tensor parallelism
+mpirun -n 2 --allow-run-as-root \
+    python run.py --max_output_len=120 \
+                  --input_text 'Tell me about yourself.' \
+                  --tokenizer_dir ./internlm-chat-7b/ \
+                  --engine_dir=./internlm-chat-7b/trt_engines/fp16/2-gpu/
+
+# InternLM 20B with bf16, 2-way tensor parallelism and 2-way pipeline parallelism
+mpirun -n 4 --allow-run-as-root \
+    python run.py --max_output_len=120 \
+                  --input_text 'Tell me about yourself.' \
+                  --tokenizer_dir ./internlm-chat-20b/ \
+                  --engine_dir=./internlm-chat-20b/trt_engines/bf16/4-gpu/
+```
+
+### Summarization using the InternLM model
+
+```bash
+# Run summarization using the InternLM 7B model in FP16.
+python summarize.py --test_trt_llm --test_hf \
+                    --hf_model_location ./internlm-chat-7b/ \
+                    --data_type fp16 \
+                    --engine_dir ./internlm-chat-7b/trt_engines/fp16/1-gpu/
+
+# Run summarization using the InternLM 7B model quantized to INT8.
+python summarize.py --test_trt_llm --test_hf \ + --hf_model_location ./internlm-chat-7b/ \ + --data_type fp16 \ + --engine_dir ./internlm-chat-7b/trt_engines/weight_only/1-gpu/ + +# Run summarization using the InternLM 7B model in FP16 using two GPUs. +mpirun -n 2 --allow-run-as-root \ + python summarize.py --test_trt_llm --test_hf \ + --hf_model_location ./internlm-chat-7b/ \ + --data_type fp16 \ + --engine_dir ./internlm-chat-7b/trt_engines/fp16/2-gpu/ + +# Run summarization using the InternLM 20B model in BF16 using 4 GPUs. +mpirun -n 4 --allow-run-as-root \ + python summarize.py --test_trt_llm --test_hf \ + --hf_model_location ./internlm-chat-20b/ \ + --data_type bf16 \ + --engine_dir ./internlm-chat-20b/trt_engines/bf16/4-gpu/ +``` diff --git a/examples/internlm/build.py b/examples/internlm/build.py new file mode 100644 index 0000000000..f72706319f --- /dev/null +++ b/examples/internlm/build.py @@ -0,0 +1,724 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import json +import os +import time +from pathlib import Path + +import tensorrt as trt +import torch +import torch.multiprocessing as mp +from transformers import AutoConfig, AutoModelForCausalLM +from weight import (get_scaling_factors, load_from_awq_internlm, + load_from_binary, load_from_gptq_internlm, + load_from_hf_internlm, load_from_meta_internlm) + +import tensorrt_llm +from tensorrt_llm._utils import str_dtype_to_trt +from tensorrt_llm.builder import Builder +from tensorrt_llm.layers.attention import PositionEmbeddingType +from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import quantize_model +from tensorrt_llm.network import net_guard +from tensorrt_llm.plugin.plugin import ContextFMHAType +from tensorrt_llm.quantization import QuantMode + +from weight import parse_ft_config # isort:skip + +MODEL_NAME = "internlm" + +# 2 routines: get_engine_name, serialize_engine +# are direct copy from gpt example, TODO: put in utils? 
+ +import onnx +import tensorrt as trt +from onnx import TensorProto, helper + + +def trt_dtype_to_onnx(dtype): + if dtype == trt.float16: + return TensorProto.DataType.FLOAT16 + elif dtype == trt.float32: + return TensorProto.DataType.FLOAT + elif dtype == trt.int32: + return TensorProto.DataType.INT32 + else: + raise TypeError("%s is not supported" % dtype) + + +def to_onnx(network, path): + inputs = [] + for i in range(network.num_inputs): + network_input = network.get_input(i) + inputs.append( + helper.make_tensor_value_info( + network_input.name, trt_dtype_to_onnx(network_input.dtype), + list(network_input.shape))) + + outputs = [] + for i in range(network.num_outputs): + network_output = network.get_output(i) + outputs.append( + helper.make_tensor_value_info( + network_output.name, trt_dtype_to_onnx(network_output.dtype), + list(network_output.shape))) + + nodes = [] + for i in range(network.num_layers): + layer = network.get_layer(i) + layer_inputs = [] + for j in range(layer.num_inputs): + ipt = layer.get_input(j) + if ipt is not None: + layer_inputs.append(layer.get_input(j).name) + layer_outputs = [ + layer.get_output(j).name for j in range(layer.num_outputs) + ] + nodes.append( + helper.make_node(str(layer.type), + name=layer.name, + inputs=layer_inputs, + outputs=layer_outputs, + domain="com.nvidia")) + + onnx_model = helper.make_model(helper.make_graph(nodes, + 'attention', + inputs, + outputs, + initializer=None), + producer_name='NVIDIA') + onnx.save(onnx_model, path) + + +def get_engine_name(model, dtype, tp_size, pp_size, rank): + if pp_size == 1: + return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) + return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, + pp_size, rank) + + +def serialize_engine(engine, path): + logger.info(f'Serializing engine to {path}...') + tik = time.time() + with open(path, 'wb') as f: + f.write(bytearray(engine)) + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Engine serialized. 
Total time: {t}') + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--world_size', type=int, default=1) + parser.add_argument('--tp_size', type=int, default=1) + parser.add_argument('--pp_size', type=int, default=1) + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--ft_model_dir', type=str, default=None) + parser.add_argument('--meta_ckpt_dir', type=str, default=None) + parser.add_argument('--quant_ckpt_path', type=str, default=None) + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument( + '--timing_cache', + type=str, + default='model.cache', + help= + 'The path of to read timing cache from, will be ignored if the file does not exist' + ) + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument('--vocab_size', type=int, default=103168) + parser.add_argument('--n_layer', type=int, default=32) + parser.add_argument('--n_positions', type=int, default=2048) + parser.add_argument('--n_embd', type=int, default=4096) + parser.add_argument('--n_head', type=int, default=32) + parser.add_argument('--n_kv_head', type=int, default=None) + parser.add_argument('--multiple_of', type=int, default=256) + parser.add_argument('--ffn_dim_multiplier', type=float, default=1.0) + parser.add_argument('--inter_size', type=int, default=None) + parser.add_argument('--hidden_act', type=str, default='silu') + parser.add_argument('--rms_norm_eps', type=float, default=1e-06) + parser.add_argument('--max_batch_size', type=int, default=8) + parser.add_argument('--max_input_len', type=int, default=2048) + parser.add_argument('--max_output_len', type=int, default=512) + parser.add_argument('--max_beam_width', type=int, default=1) + parser.add_argument('--rotary_base', type=float, default=10000.0) + parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None) + parser.add_argument('--use_gpt_attention_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'bfloat16', 'float32']) + parser.add_argument('--use_gemm_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'bfloat16', 'float32']) + parser.add_argument('--use_rmsnorm_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'float32', 'bfloat16']) + parser.add_argument('--parallel_build', default=False, action='store_true') + parser.add_argument('--enable_context_fmha', + default=False, + action='store_true') + parser.add_argument('--enable_context_fmha_fp32_acc', + default=False, + action='store_true') + parser.add_argument('--visualize', default=False, action='store_true') + parser.add_argument('--enable_debug_output', + default=False, + action='store_true') + parser.add_argument('--gpus_per_node', type=int, default=8) + parser.add_argument('--builder_opt', type=int, default=None) + parser.add_argument( + '--output_dir', + type=str, + default='internlm_outputs', + help= + 'The path to save the serialized engine files, timing cache file and model configs' + ) + parser.add_argument('--remove_input_padding', + default=False, + action='store_true') + + # Arguments related to the quantization of the model. + parser.add_argument( + '--use_smooth_quant', + default=False, + action="store_true", + help= + 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' + 'See --per_channel and --per_token for finer-grained quantization options.' 
+ ) + parser.add_argument( + '--per_channel', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor for the GEMM\'s result. ' + 'per_channel instead uses a different static scaling factor for each channel. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_token', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale activations in the int8 range. ' + 'per_token chooses at run time, and for each token, a custom scaling factor. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_group', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale weights in the int4 range. ' + 'per_group chooses at run time, and for each group, a custom scaling factor. ' + 'The flag is built for GPTQ/AWQ quantization.') + parser.add_argument('--group_size', + type=int, + default=128, + help='Group size used in GPTQ/AWQ quantization.') + parser.add_argument( + '--int8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' + ) + parser.add_argument( + '--use_parallel_embedding', + action="store_true", + default=False, + help= + 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' + ) + parser.add_argument( + '--embedding_sharding_dim', + type=int, + default=1, # Meta does TP on hidden dim + choices=[0, 1], + help= + 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). ' + 'To shard it along hidden dimension, set embedding_sharding_dim=1' + 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' + ) + parser.add_argument( + '--enable_fp8', + default=False, + action='store_true', + help='Use FP8 Linear layer for Attention QKV/Dense and MLP.') + parser.add_argument( + '--fp8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. fp8_kv_cache chooses int8 quantization for KV' + ) + parser.add_argument( + '--quantized_fp8_model_path', + type=str, + default=None, + help='Path of a quantized model checkpoint in .npz format') + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4', 'int4_awq', 'int4_gptq'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--use_inflight_batching', + action="store_true", + default=False, + help="Activates inflight batching mode of gptAttentionPlugin.") + parser.add_argument( + '--paged_kv_cache', + action="store_true", + default=False, + help= + 'By default we use contiguous KV cache. 
By setting this flag you enable paged KV cache' + ) + parser.add_argument('--tokens_per_block', + type=int, + default=64, + help='Number of tokens per block in paged KV cache') + parser.add_argument( + '--max_num_tokens', + type=int, + default=None, + help='Define the max number of tokens supported by the engine') + parser.add_argument( + '--strongly_typed', + default=False, + action="store_true", + help= + 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' + ) + parser.add_argument( + '--use_custom_all_reduce', + action='store_true', + help= + 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') + + args = parser.parse_args() + tensorrt_llm.logger.set_level(args.log_level) + + assert not ( + args.use_smooth_quant and args.use_weight_only + ), "You cannot enable both SmoothQuant and INT8 weight-only together." + + if not args.remove_input_padding: + if args.use_gpt_attention_plugin: + logger.warning( + f"It is recommended to specify --remove_input_padding when using GPT attention plugin" + ) + + if args.use_inflight_batching: + if not args.use_gpt_attention_plugin: + args.use_gpt_attention_plugin = 'float16' + logger.info( + f"Using GPT attention plugin for inflight batching mode. Setting to default '{args.use_gpt_attention_plugin}'" + ) + if not args.remove_input_padding: + args.remove_input_padding = True + logger.info( + "Using remove input padding for inflight batching mode.") + if not args.paged_kv_cache: + args.paged_kv_cache = True + logger.info("Using paged KV cache for inflight batching mode.") + + if args.use_smooth_quant: + args.quant_mode = QuantMode.use_smooth_quant(args.per_token, + args.per_channel) + elif args.use_weight_only: + if args.per_group: + args.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=True, + use_int4_weights=True) + else: + args.quant_mode = QuantMode.use_weight_only( + args.weight_only_precision == 'int4') + else: + args.quant_mode = QuantMode(0) + + if args.int8_kv_cache: + args.quant_mode = args.quant_mode.set_int8_kv_cache() + elif args.fp8_kv_cache: + args.quant_mode = args.quant_mode.set_fp8_kv_cache() + if args.enable_fp8: + args.quant_mode = args.quant_mode.set_fp8_qdq() + + if args.rotary_scaling is not None: + rotary_scaling = { + "type": args.rotary_scaling[0], + "factor": float(args.rotary_scaling[1]) + } + assert rotary_scaling["type"] in ["linear", "dynamic"] + assert rotary_scaling["factor"] > 1.0 + args.rotary_scaling = rotary_scaling + if rotary_scaling["type"] == "dynamic": + assert not args.remove_input_padding, "TODO: Not supported yet" + + # Since gpt_attenttion_plugin is the only way to apply RoPE now, + # force use the plugin for now with the correct data type. 
+ args.use_gpt_attention_plugin = args.dtype + if args.model_dir is not None: + hf_config = AutoConfig.from_pretrained(args.model_dir, + trust_remote_code=True) + args.inter_size = hf_config.intermediate_size # override the inter_size for InternLM + args.n_embd = hf_config.hidden_size + args.n_head = hf_config.num_attention_heads + if hasattr(hf_config, "num_key_value_heads"): + args.n_kv_head = hf_config.num_key_value_heads + args.n_layer = hf_config.num_hidden_layers + args.n_positions = hf_config.max_position_embeddings + args.vocab_size = hf_config.vocab_size + args.hidden_act = hf_config.hidden_act + args.rms_norm_eps = hf_config.rms_norm_eps + args.attn_bias = hf_config.bias + elif args.meta_ckpt_dir is not None: + # Not tested + with open(Path(args.meta_ckpt_dir, "params.json")) as fp: + meta_config: dict = json.load(fp) + args.n_embd = meta_config["dim"] + args.n_head = meta_config["n_heads"] + args.n_layer = meta_config["n_layers"] + args.n_kv_head = meta_config.get("n_kv_heads", args.n_head) + args.multiple_of = meta_config["multiple_of"] + args.ffn_dim_multiplier = meta_config.get("ffn_dim_multiplier", 1) + n_embd = int(4 * args.n_embd * 2 / 3) + args.inter_size = args.multiple_of * ( + (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) // + args.multiple_of) + args.rms_norm_eps = meta_config["norm_eps"] + elif args.ft_model_dir is not None: + n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head, attn_bias = parse_ft_config( + Path(args.ft_model_dir) / "config.ini") + args.inter_size = inter_size # override the inter_size for InternLM + args.n_kv_head = n_kv_head + args.n_embd = n_embd + args.n_head = n_head + args.n_layer = n_layer + args.n_positions = n_positions + args.vocab_size = vocab_size + args.hidden_act = hidden_act + args.rms_norm_eps = 1e-06 + logger.warning("Set rms_norm_eps to 1e-06 directly.") + args.attn_bias = attn_bias + assert args.use_gpt_attention_plugin, "InternLM must use gpt attention plugin" + if args.n_kv_head is None: + args.n_kv_head = args.n_head + elif args.n_kv_head != args.n_head: + assert (args.n_head % args.n_kv_head) == 0, \ + "MQA/GQA requires the number of heads to be divisible by the number of K/V heads." + assert (args.n_kv_head % args.tp_size) == 0 or (args.tp_size % args.n_kv_head) == 0, \ + "MQA/GQA requires either the number of K/V heads to be divisible by the tensor parallelism size OR " \ + "the tensor parallelism size to be divisible by the number of K/V heads." + + if args.dtype == 'bfloat16': + assert args.use_gemm_plugin, "Please use gemm plugin when dtype is bfloat16" + + assert args.pp_size * args.tp_size == args.world_size + + if args.max_num_tokens is not None: + assert args.enable_context_fmha + + if args.inter_size is None: + # this should not be need when loading a real model + # but it is helpful when creating a dummy model without loading any real weights + n_embd = int(4 * args.n_embd * 2 / 3) + args.inter_size = args.multiple_of * ( + (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) // + args.multiple_of) + logger.info(f"Setting inter_size to {args.inter_size}.") + + return args + + +def build_rank_engine(builder: Builder, + builder_config: tensorrt_llm.builder.BuilderConfig, + engine_name, rank, args): + ''' + @brief: Build the engine on the given rank. + @param rank: The rank to build the engine. + @param args: The cmd line arguments. + @return: The built engine. 
+ ''' + dtype = str_dtype_to_trt(args.dtype) + mapping = Mapping(world_size=args.world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size) + + assert args.n_layer % args.pp_size == 0, \ + f"num_layers {args.n_layer} must be a multiple of pipeline parallelism size {args.pp_size}" + + # Initialize Module + tensorrt_llm_internlm = tensorrt_llm.models.InternLMForCausalLM( + num_layers=args.n_layer, + num_heads=args.n_head, + num_kv_heads=args.n_kv_head, + hidden_size=args.n_embd, + vocab_size=args.vocab_size, + hidden_act=args.hidden_act, + attn_bias=args.attn_bias, + max_position_embeddings=args.n_positions, + dtype=dtype, + mlp_hidden_size=args.inter_size, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + mapping=mapping, + rotary_base=args.rotary_base, + rotary_scaling=args.rotary_scaling, + use_parallel_embedding=args.use_parallel_embedding, + embedding_sharding_dim=args.embedding_sharding_dim, + quant_mode=args.quant_mode, + rms_norm_eps=args.rms_norm_eps) + if args.use_smooth_quant: + tensorrt_llm_internlm = quantize_model(tensorrt_llm_internlm, + args.quant_mode) + elif args.use_weight_only: + if args.weight_only_precision == 'int8': + tensorrt_llm_internlm = quantize_model(tensorrt_llm_internlm, + args.quant_mode) + elif args.weight_only_precision == 'int4': + tensorrt_llm_internlm = quantize_model(tensorrt_llm_internlm, + args.quant_mode) + elif args.weight_only_precision == 'int4_awq': + tensorrt_llm_internlm = quantize_model(model=tensorrt_llm_internlm, + quant_mode=args.quant_mode, + group_size=args.group_size, + zero=False, + pre_quant_scale=True, + exclude_modules=[]) + elif args.weight_only_precision == 'int4_gptq': + tensorrt_llm_internlm = quantize_model(model=tensorrt_llm_internlm, + quant_mode=args.quant_mode, + group_size=args.group_size, + zero=True, + pre_quant_scale=False) + elif args.enable_fp8 or args.fp8_kv_cache: + logger.info(f'Loading scaling factors from ' + f'{args.quantized_fp8_model_path}') + quant_scales = get_scaling_factors(args.quantized_fp8_model_path, + num_layers=args.n_layer, + quant_mode=args.quant_mode) + tensorrt_llm_internlm = quantize_model(tensorrt_llm_internlm, + quant_mode=args.quant_mode, + quant_scales=quant_scales) + if args.per_group: + load_func = load_from_awq_internlm if args.weight_only_precision == 'int4_awq' else load_from_gptq_internlm + load_func(tensorrt_llm_internlm=tensorrt_llm_internlm, + quant_ckpt_path=args.quant_ckpt_path, + mapping=mapping, + dtype=args.dtype) + elif args.meta_ckpt_dir is not None: + load_from_meta_internlm(tensorrt_llm_internlm, args.meta_ckpt_dir, + mapping, args.dtype) + elif args.model_dir is not None: + logger.info(f'Loading HF InternLM ... from {args.model_dir}') + tik = time.time() + hf_internlm = AutoModelForCausalLM.from_pretrained( + args.model_dir, + device_map={ + "model": "cpu", + "lm_head": "cpu" + }, # Load to CPU memory + torch_dtype="auto", + trust_remote_code=True) + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'HF InternLM loaded. 
Total time: {t}') + load_from_hf_internlm(tensorrt_llm_internlm, + hf_internlm, + mapping=mapping, + dtype=args.dtype) + del hf_internlm + elif args.ft_model_dir is not None: + load_from_binary(tensorrt_llm_internlm, + args.ft_model_dir, + mapping, + fp16=(args.dtype == 'float16'), + multi_query_mode=(args.n_kv_head != args.n_head)) + + # Module -> Network + network = builder.create_network() + network.trt_network.name = engine_name + if args.use_gpt_attention_plugin: + network.plugin_config.set_gpt_attention_plugin( + dtype=args.use_gpt_attention_plugin) + if args.use_gemm_plugin: + network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) + if args.use_rmsnorm_plugin: + network.plugin_config.set_rmsnorm_plugin(dtype=args.use_rmsnorm_plugin) + + # Quantization plugins. + if args.use_smooth_quant: + network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) + network.plugin_config.set_rmsnorm_quantization_plugin(dtype=args.dtype) + network.plugin_config.set_quantize_tensor_plugin() + network.plugin_config.set_quantize_per_token_plugin() + assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) + if args.enable_context_fmha: + network.plugin_config.set_context_fmha(ContextFMHAType.enabled) + if args.enable_context_fmha_fp32_acc: + network.plugin_config.set_context_fmha( + ContextFMHAType.enabled_with_fp32_acc) + if args.use_weight_only: + if args.per_group: + network.plugin_config.set_weight_only_groupwise_quant_matmul_plugin( + dtype='float16') + else: + network.plugin_config.set_weight_only_quant_matmul_plugin( + dtype='float16') + if args.world_size > 1: + network.plugin_config.set_nccl_plugin(args.dtype, + args.use_custom_all_reduce) + if args.remove_input_padding: + network.plugin_config.enable_remove_input_padding() + if args.paged_kv_cache: + network.plugin_config.enable_paged_kv_cache(args.tokens_per_block) + + with net_guard(network): + # Prepare + network.set_named_parameters(tensorrt_llm_internlm.named_parameters()) + + # Forward + inputs = tensorrt_llm_internlm.prepare_inputs(args.max_batch_size, + args.max_input_len, + args.max_output_len, True, + args.max_beam_width, + args.max_num_tokens) + tensorrt_llm_internlm(*inputs) + if args.enable_debug_output: + # mark intermediate nodes' outputs + for k, v in tensorrt_llm_internlm.named_network_outputs(): + v = v.trt_tensor + v.name = k + network.trt_network.mark_output(v) + v.dtype = dtype + if args.visualize: + model_path = os.path.join(args.output_dir, 'test.onnx') + to_onnx(network.trt_network, model_path) + + tensorrt_llm.graph_rewriting.optimize(network) + + engine = None + + # Network -> Engine + engine = builder.build_engine(network, builder_config) + if rank == 0: + config_path = os.path.join(args.output_dir, 'config.json') + builder.save_config(builder_config, config_path) + return engine + + +def build(rank, args): + torch.cuda.set_device(rank % args.gpus_per_node) + logger.set_level(args.log_level) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + # when doing serializing build, all ranks share one engine + builder = Builder() + + cache = None + for cur_rank in range(args.world_size): + # skip other ranks if parallel_build is enabled + if args.parallel_build and cur_rank != rank: + continue + # NOTE: when only int8 kv cache is used together with paged kv cache no int8 tensors are exposed to TRT + int8_trt_flag = args.quant_mode.has_act_or_weight_quant() or ( + not args.paged_kv_cache and args.quant_mode.has_int8_kv_cache()) + builder_config = 
builder.create_builder_config( + name=MODEL_NAME, + precision=args.dtype, + timing_cache=args.timing_cache if cache is None else cache, + tensor_parallel=args.tp_size, + pipeline_parallel=args.pp_size, + parallel_build=args.parallel_build, + num_layers=args.n_layer, + num_heads=args.n_head, + num_kv_heads=args.n_kv_head, + hidden_size=args.n_embd, + vocab_size=args.vocab_size, + hidden_act=args.hidden_act, + max_position_embeddings=args.n_positions, + max_batch_size=args.max_batch_size, + max_input_len=args.max_input_len, + max_output_len=args.max_output_len, + max_num_tokens=args.max_num_tokens, + int8=int8_trt_flag, + fp8=args.quant_mode.has_fp8_qdq(), + quant_mode=args.quant_mode, + strongly_typed=args.strongly_typed, + opt_level=args.builder_opt) + engine_name = get_engine_name(MODEL_NAME, args.dtype, args.tp_size, + args.pp_size, cur_rank) + engine = build_rank_engine(builder, builder_config, engine_name, + cur_rank, args) + assert engine is not None, f'Failed to build engine for rank {cur_rank}' + + if cur_rank == 0: + # Use in-memory timing cache for multiple builder passes. + if not args.parallel_build: + cache = builder_config.trt_builder_config.get_timing_cache() + + serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + + if rank == 0: + ok = builder.save_timing_cache( + builder_config, os.path.join(args.output_dir, "model.cache")) + assert ok, "Failed to save timing cache." + + +if __name__ == '__main__': + args = parse_arguments() + tik = time.time() + if args.parallel_build and args.world_size > 1 and \ + torch.cuda.device_count() >= args.world_size: + logger.warning( + f'Parallelly build TensorRT engines. Please make sure that all of the {args.world_size} GPUs are totally free.' + ) + mp.spawn(build, nprocs=args.world_size, args=(args, )) + else: + args.parallel_build = False + logger.info('Serially build TensorRT engines.') + build(0, args) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Total time of building all {args.world_size} engines: {t}') diff --git a/examples/internlm/convert.py b/examples/internlm/convert.py new file mode 100644 index 0000000000..26831c1c54 --- /dev/null +++ b/examples/internlm/convert.py @@ -0,0 +1,322 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Utilities for exporting a model to our custom format. 
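+    Note added for clarity: each tensor is written as a raw binary file named
+    model.<key>.bin (shared tensors) or model.<key>.<tp_rank>.bin (tensor-parallel
+    shards) under a "<tp>-gpu" output directory; see save_val/save_split below and
+    hf_internlm_convert.py for how the directory is created.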
+""" +import numpy as np +import torch + + +def save_val(val, dir, key, tp_num=None): + suffix = "bin" if tp_num is None else f"{tp_num}.bin" + val.tofile(dir / f"model.{key}.{suffix}") + + +def save_split(split_vals, dir, key, i, factor): + for j, val in enumerate(split_vals): + save_val(val, dir, key, i * factor + j) + + +def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False): + """ + This function has two purposes: + - compute quantized weights, scaled either per-tensor or per-column + - compute scaling factors + + Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ. + CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W. + CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor. + + Here is the list of what we need (T means per-tensor, C per-column): + - scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8). Used before the GEMM. (T) + - scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T) + - scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C) + - scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32) + to quant range (int8) (used for CUBLAS) (T, C) + + Note that we don't do anything special about row-parallel GEMM. Theoretically, we could have per-GPU scaling factors too, + but then the model would change depending on the number of GPUs used. + + For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection, we consider it + as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V. + For our GEMM implementation to respect this behavior, we use per-column mode and replicate values along columns. + """ + + # compute weight scaling factors for fp->int8 and int8->fp + if is_qkv and not multi_query_mode: + scale_w_orig_quant_t = 127. / act_range["w"].reshape(3, -1).max( + dim=-1, keepdims=True)[0].cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].reshape(3, + -1).cpu().numpy() + elif is_qkv and multi_query_mode: + hidden_dim = weights.shape[0] + local_dim = act_range["w"].shape[0] + kv_dim = (local_dim - hidden_dim) // 2 + scale_w_q = act_range["w"][0:hidden_dim] + scale_w_k = act_range["w"][hidden_dim:hidden_dim + kv_dim] + scale_w_v = act_range["w"][-kv_dim:] + + scale_w_qkv_t = torch.concat([ + scale_w_q.max(dim=0, keepdim=True)[0], + scale_w_k.max(dim=0, keepdim=True)[0], + scale_w_v.max(dim=0, keepdim=True)[0] + ]) + + scale_w_orig_quant_t = 127. / scale_w_qkv_t.cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy() + else: + scale_w_orig_quant_t = 127. / act_range["w"].max().cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy() + scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t + scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c + + # compute the rest of needed scaling factors + scale_x_orig_quant_t = np.array(127. / act_range["x"].max().item()) + scale_y_orig_quant_t = np.array(127. / act_range["y"].max().item()) + scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.) 
+ scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_t) + scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_c) + if is_qkv and not multi_query_mode: + scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t, + scale_w_orig_quant_c.shape) + scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t, + scale_w_orig_quant_c.shape) + if is_qkv and multi_query_mode: + scale_q_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[0], + scale_w_q.shape) + scale_k_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[1], + scale_w_k.shape) + scale_v_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[2], + scale_w_v.shape) + scale_y_accum_quant_t = np.concatenate( + [scale_q_y_accum_t, scale_k_y_accum_t, scale_v_y_accum_t]) + scale_w_quant_orig_t = np.concatenate([ + np.broadcast_to(scale_w_quant_orig_t[0], scale_w_q.shape), + np.broadcast_to(scale_w_quant_orig_t[1], scale_w_k.shape), + np.broadcast_to(scale_w_quant_orig_t[2], scale_w_v.shape) + ]) + + to_i8 = lambda x: x.round().clip(-127, 127).astype(np.int8) + + if is_qkv and multi_query_mode: + scale_w_quant_orig_t_expand = np.ones([weights.shape[-1]]) + scale_w_quant_orig_t_expand[:hidden_dim] = scale_w_quant_orig_t[0] + scale_w_quant_orig_t_expand[hidden_dim:hidden_dim + + kv_dim] = scale_w_quant_orig_t[1] + scale_w_quant_orig_t_expand[-kv_dim:] = scale_w_quant_orig_t[2] + weight_int8 = to_i8(weights * scale_w_quant_orig_t_expand) + else: + weight_int8 = to_i8(weights * scale_w_orig_quant_t) + return { + "weight.int8": weight_int8, + "weight.int8.col": to_i8(weights * scale_w_orig_quant_c), + "scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32), + "scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32), + "scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32), + "scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32), + "scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32), + "scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32), + } + + +def save_multi_query_mode_qkv_int8(val, dir, base_key, saved_key, factor, rank, + local_dim, head_size): + q, k, v = np.split(val, [local_dim, local_dim + head_size], axis=-1) + q_split = np.split(q, factor, axis=-1) + k_split = np.split(k, factor, axis=-1) + v_split = np.split(v, factor, axis=-1) + split_vals = [ + np.concatenate((q_split[ii], k_split[ii], v_split[ii]), axis=-1) + for ii in range(factor) + ] + save_split(split_vals, dir, f"{base_key}.{saved_key}", rank, factor) + + +def write_int8(vals, + dir, + base_key, + split_dim, + i, + factor, + is_qkv=False, + multi_query_mode=False): + saved_keys_once = [ + "scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant", + "scale_y_quant_orig" + ] + + if is_qkv and multi_query_mode: + assert split_dim == -1 + local_dim = vals["weight.int8"].shape[0] + head_size = (vals["weight.int8"].shape[1] - local_dim) // 2 + + save_multi_query_mode_qkv_int8(vals["weight.int8"], dir, base_key, + "weight.int8", factor, i, local_dim, + head_size) + save_multi_query_mode_qkv_int8(vals["weight.int8.col"], dir, base_key, + "weight.int8.col", factor, i, local_dim, + head_size) + save_multi_query_mode_qkv_int8(vals["scale_w_quant_orig.col"], dir, + base_key, "scale_w_quant_orig.col", + factor, i, local_dim, head_size) + save_multi_query_mode_qkv_int8(vals["scale_y_accum_quant.col"], dir, + base_key, "scale_y_accum_quant.col", + factor, i, local_dim, head_size) + save_multi_query_mode_qkv_int8(vals["scale_w_quant_orig"], dir, + 
base_key, "scale_w_quant_orig", factor, + i, local_dim, head_size) + save_multi_query_mode_qkv_int8(vals["scale_y_accum_quant"], dir, + base_key, "scale_y_accum_quant", factor, + i, local_dim, head_size) + saved_keys_once = ["scale_x_orig_quant", "scale_y_quant_orig"] + else: + save_split(np.split(vals["weight.int8"], factor, axis=split_dim), dir, + f"{base_key}.weight.int8", i, factor) + save_split(np.split(vals["weight.int8.col"], factor, axis=split_dim), + dir, f"{base_key}.weight.int8.col", i, factor) + + if split_dim == -1: + save_split( + np.split(vals["scale_w_quant_orig.col"], factor, + axis=split_dim), dir, + f"{base_key}.scale_w_quant_orig.col", i, factor) + save_split( + np.split(vals["scale_y_accum_quant.col"], + factor, + axis=split_dim), dir, + f"{base_key}.scale_y_accum_quant.col", i, factor) + if is_qkv: + save_split( + np.split(vals["scale_y_accum_quant"], + factor, + axis=split_dim), dir, + f"{base_key}.scale_y_accum_quant", i, factor) + save_split( + np.split(vals["scale_w_quant_orig"], factor, + axis=split_dim), dir, + f"{base_key}.scale_w_quant_orig", i, factor) + saved_keys_once = ["scale_x_orig_quant", "scale_y_quant_orig"] + else: + saved_keys_once += [ + "scale_w_quant_orig.col", "scale_y_accum_quant.col" + ] + + if i == 0: + for save_key in saved_keys_once: + save_val(vals[save_key], dir, f"{base_key}.{save_key}") + + +def str_to_np_dtype(type_str): + convert_dict = { + "fp32": np.float32, + "fp16": np.float16, + } + dtype = convert_dict.get(type_str) + if dtype is None: + raise ValueError(f"{type_str} is an invalid storage type") + return dtype + + +def split_and_save_weight(i, saved_dir, factor, key, val, act_range, config): + # The split_factor indicates the number of ranks to implement + # distributed GEMMs. For Tensor Parallelism, each rank/GPU works + # on split_hidden_dim // split_factor channels. 
+ + int8_outputs = config.get("int8_outputs", None) + multi_query_mode = config.get("multi_query_mode", False) + local_dim = config.get("local_dim", None) + + save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only" + + if "input_layernorm.weight" in key or "input_layernorm.bias" in key or \ + "attention.dense.bias" in key or "post_layernorm.weight" in key or \ + "post_attention_layernorm.bias" in key or "mlp.dense_4h_to_h.bias" in key or \ + "final_layernorm.weight" in key or "final_layernorm.bias" in key: + + # shared weights, only need to convert the weights of rank 0 + if i == 0: + save_val(val, saved_dir, key) + + elif "attention.dense.weight" in key or "mlp.proj.weight" in key: + split_dim = 0 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, act_range) + write_int8(vals_i8, saved_dir, base_key, split_dim, i, factor) + + elif "mlp.fc.weight" in key or "mlp.gate.weight" in key: + split_dim = -1 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, act_range) + write_int8(vals_i8, saved_dir, base_key, split_dim, i, factor) + + elif "attention.query_key_value.weight" in key: + hidden_dim = val.shape[0] + if local_dim is None: + local_dim = val.shape[-1] // 3 + if multi_query_mode: + head_size = (val.shape[-1] - local_dim) // 2 + val = val.reshape(hidden_dim, local_dim + 2 * head_size) + w_q, w_k, w_v = np.split(val, [local_dim, local_dim + head_size], + axis=-1) + w_q_split = np.split(w_q, factor, axis=-1) + w_k_split = np.split(w_k, factor, axis=-1) + w_v_split = np.split(w_v, factor, axis=-1) + split_vals = [ + np.concatenate((w_q_split[ii], w_k_split[ii], w_v_split[ii]), + axis=-1) for ii in range(factor) + ] + split_dim = -1 + else: + val = val.reshape(hidden_dim, 3, local_dim) + split_dim = -1 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if save_int8: + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, + act_range, + is_qkv=True, + multi_query_mode=multi_query_mode) + write_int8(vals_i8, + saved_dir, + base_key, + split_dim, + i, + factor, + is_qkv=True, + multi_query_mode=multi_query_mode) + + elif "attention.query_key_value.bias" in key: + if local_dim is None: + local_dim = val.shape[-1] // 3 + + val = val.reshape(3, local_dim) + split_vals = np.split(val, factor, axis=-1) + save_split(split_vals, saved_dir, key, i, factor) + + elif "attention.dense.smoother" in key or "mlp.proj.smoother" in key: + split_vals = np.split(val, factor, axis=0) + save_split(split_vals, saved_dir, key, i, factor) + + else: + print(f"[WARNING] {key} not handled by converter") diff --git a/examples/internlm/hf_internlm_convert.py b/examples/internlm/hf_internlm_convert.py new file mode 100644 index 0000000000..23f80c3102 --- /dev/null +++ b/examples/internlm/hf_internlm_convert.py @@ -0,0 +1,368 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Convert huggingface GPT model. Use https://huggingface.co/gpt2 as demo. +''' +import argparse +import configparser +import os +from pathlib import Path + +import torch +import torch.multiprocessing as multiprocessing +from convert import split_and_save_weight, str_to_np_dtype +from smoothquant import (capture_activation_range, smooth_gemm, + smooth_gemm_fc1_gate) +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer + + +def merge_qkv_scales(q_name, hf_model, scales, internlm_qkv_para): + layer_name_q = q_name.replace(".weight", "") + layer_name_k = layer_name_q.replace("q_proj", "k_proj") + layer_name_v = layer_name_q.replace("q_proj", "v_proj") + layer_name_qkv = layer_name_q.replace("q_proj", "qkv_proj") + + q = hf_model.state_dict()[layer_name_q + ".weight"] + k = hf_model.state_dict()[layer_name_k + ".weight"] + v = hf_model.state_dict()[layer_name_v + ".weight"] + + weight = torch.cat([q, k, v], dim=0) + + scales[layer_name_qkv]["x"] = scales[layer_name_q]["x"] + scales[layer_name_qkv]["w"] = weight.abs().max(dim=1)[0] + print(scales[layer_name_q]) + scales[layer_name_qkv]["y"] = torch.cat([ + scales[layer_name_q]["y"], scales[layer_name_k]["y"], + scales[layer_name_v]["y"] + ], + dim=0) + + internlm_qkv_para[layer_name_qkv] = weight.transpose(0, 1) + + +def merge_qkv_bias(q_name, hf_model, internlm_qkv_para={}): + layer_name_q = q_name.replace(".bias", "") + layer_name_k = layer_name_q.replace("q_proj", "k_proj") + layer_name_v = layer_name_q.replace("q_proj", "v_proj") + # layer_name_qkv = layer_name_q.replace("q_proj", "qkv_proj") + + q = hf_model.state_dict()[layer_name_q + ".bias"] + k = hf_model.state_dict()[layer_name_k + ".bias"] + v = hf_model.state_dict()[layer_name_v + ".bias"] + + bias = torch.cat([q, k, v], dim=0) + + return bias + + +@torch.no_grad() +def smooth_internlm_model(model, scales, alpha, internlm_qkv_para, + internlm_smoother): + # Smooth the activation and weights with smoother = $\diag{s}$ + for name, module in model.named_modules(): + if not module.__class__.__name__ == "InternLMDecoderLayer": + continue + # qkv_proj + layer_name_q = name + ".self_attn.q_proj" + layer_name_k = name + ".self_attn.k_proj" + layer_name_v = name + ".self_attn.v_proj" + layer_name_qkv = name + ".self_attn.qkv_proj" + + weight = torch.cat([ + module.self_attn.q_proj.weight, module.self_attn.k_proj.weight, + module.self_attn.v_proj.weight + ], + dim=0) + + smoother = smooth_gemm(weight, scales[layer_name_q]["x"], + module.input_layernorm.weight, None, alpha) + + scales[layer_name_qkv]["x"] = scales[layer_name_q]["x"] / smoother + scales[layer_name_qkv]["w"] = weight.abs().max(dim=1)[0] + scales[layer_name_qkv]["y"] = torch.cat([ + scales[layer_name_q]["y"], scales[layer_name_k]["y"], + scales[layer_name_v]["y"] + ], + dim=0) + + # see transpose_weights function + internlm_qkv_para[layer_name_qkv] = weight.transpose(0, 1) + + # ================================================================= + layer_name = name + ".self_attn.o_proj" + smoother = smooth_gemm(module.self_attn.o_proj.weight, + scales[layer_name]["x"], None, None, alpha) + 
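+        # Background note (added; see smoothquant.py): smooth_gemm returns a
+        # per-channel smoother s = amax(x)**alpha / amax(w)**(1 - alpha).
+        # The activation is conceptually divided by s and the weight is
+        # multiplied by s, which keeps x @ W unchanged while migrating
+        # activation outliers into the weights so that int8 quantization of
+        # the activations loses less accuracy.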
internlm_smoother[layer_name] = smoother.float() + + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.self_attn.o_proj.weight.abs().max( + dim=1)[0] + + # ================================================================== + fc1_layer_name = name + ".mlp.gate_proj" + gate_layer_name = name + ".mlp.up_proj" + + smoother = smooth_gemm_fc1_gate(module.mlp.gate_proj.weight, + module.mlp.up_proj.weight, + scales[fc1_layer_name]["x"], + module.post_attention_layernorm.weight, + None, alpha) + + scales[fc1_layer_name]["x"] = scales[fc1_layer_name]["x"] / smoother + scales[fc1_layer_name]["w"] = module.mlp.gate_proj.weight.abs().max( + dim=1)[0] + + scales[gate_layer_name]["x"] = scales[gate_layer_name]["x"] / smoother + scales[gate_layer_name]["w"] = module.mlp.up_proj.weight.abs().max( + dim=1)[0] + + # ================================================================== + layer_name = name + ".mlp.down_proj" + smoother = smooth_gemm(module.mlp.down_proj.weight, + scales[layer_name]["x"], None, None, alpha) + internlm_smoother[layer_name] = smoother.float() + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.mlp.down_proj.weight.abs().max( + dim=1)[0] + + +def gpt_to_ft_name(orig_name): + global_ft_weights = { + "model.embed_tokens.weight": 'vocab_embedding.weight', + "model.norm.weight": 'ln_f.weight', + "lm_head.weight": 'lm_head.weight', + } + + if orig_name in global_ft_weights: + return global_ft_weights[orig_name] + + _, _, layer_id, *weight_name = orig_name.split(".") + + layer_id = int(layer_id) + weight_name = ".".join(weight_name) + + if weight_name == 'self_attn.q_proj.weight': + return f"layers.{layer_id}.attention.query_key_value.weight" + elif weight_name == 'self_attn.k_proj.weight' or weight_name == 'self_attn.v_proj.weight': + return f"layers.{layer_id}.attention.kv.weight" + if weight_name == 'self_attn.q_proj.bias': + return f"layers.{layer_id}.attention.query_key_value.bias" + elif weight_name == 'self_attn.k_proj.bias' or weight_name == 'self_attn.v_proj.bias': + return f"layers.{layer_id}.attention.kv.bias" + + per_layer_weights = { + "input_layernorm.weight": "input_layernorm.weight", + "self_attn.o_proj.weight": "attention.dense.weight", + "self_attn.o_proj.bias": "attention.dense.bias", + "mlp.gate_proj.weight": "mlp.fc.weight", + "mlp.down_proj.weight": "mlp.proj.weight", + "mlp.up_proj.weight": "mlp.gate.weight", + "post_attention_layernorm.weight": "post_layernorm.weight", + } + + return f"layers.{layer_id}.{per_layer_weights[weight_name]}" + + +# LLaMA uses nn.Linear for these following ops whose weight matrix is transposed compared to gpt2. +# In order to use the preprocess codes of gpt2, we transpose them firstly. 
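+# Illustrative example (added): an InternLM/LLaMA-style o_proj stores its weight
+# as [out_features, in_features]; transposing it to [in_features, out_features]
+# matches the GPT-2/FT layout that split_and_save_weight expects, so the same
+# row/column splitting code can be reused unchanged.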
+def transpose_weights(hf_name, param): + weight_to_transpose = ["o_proj", "gate_proj", "down_proj", "up_proj"] + if any([k in hf_name for k in weight_to_transpose]): + if len(param.shape) == 2: + param = param.transpose(0, 1) + return param + + +def hf_gpt_converter(args): + infer_tp = args.tensor_parallelism + saved_dir = Path(args.out_dir) / f"{infer_tp}-gpu" + saved_dir.mkdir(parents=True, exist_ok=True) + + model = AutoModelForCausalLM.from_pretrained(args.in_file, + device_map="auto", + trust_remote_code=True) + + act_range = {} + internlm_qkv_para = {} + # smoother for inputs of self_attn.o_proj and mlp.down_proj + internlm_smoother = {} + + if args.smoothquant is not None or args.calibrate_kv_cache: + os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get( + "TOKENIZERS_PARALLELISM", "false") + act_range = capture_activation_range( + model, + AutoTokenizer.from_pretrained(args.in_file, + padding_side='left', + trust_remote_code=True)) + if args.smoothquant is not None: + smooth_internlm_model(model, act_range, args.smoothquant, + internlm_qkv_para, internlm_smoother) + + config = configparser.ConfigParser() + config["internlm"] = {} + for key in vars(args): + config["internlm"][key] = f"{vars(args)[key]}" + for k, v in vars(model.config).items(): + config["internlm"][k] = f"{v}" + config["internlm"]["weight_data_type"] = args.storage_type + config["internlm"]["multi_query_mode"] = str(args.multi_query_mode) + with open(saved_dir / "config.ini", 'w') as configfile: + config.write(configfile) + + storage_type = str_to_np_dtype(args.storage_type) + + global_ft_weights = [ + 'vocab_embedding.weight', 'ln_f.weight', 'lm_head.weight' + ] + + int8_outputs = None + if args.calibrate_kv_cache: + int8_outputs = "kv_cache_only" + if args.smoothquant is not None: + int8_outputs = "all" + + starmap_args = [] + for name, param in model.named_parameters(): + if "weight" not in name and "bias" not in name: + continue + ft_name = gpt_to_ft_name(name) + + if name.replace(".weight", "") in internlm_smoother.keys(): + smoother = internlm_smoother[name.replace(".weight", "")] + smoother = smoother.detach().cpu().numpy() + starmap_args.append( + (0, saved_dir, infer_tp, + f"{ft_name}.smoother".replace(".weight", ""), smoother, None, { + "int8_outputs": int8_outputs, + "multi_query_mode": args.multi_query_mode, + "local_dim": None, + })) + + param = transpose_weights(name, param) + + param = param.detach().cpu().numpy().astype(storage_type) + + if ft_name in global_ft_weights: + param.tofile(saved_dir / f"{ft_name}.bin") + elif ft_name.split('.')[-2:] == ['query_key_value', 'bias']: + param = merge_qkv_bias(name, model) + param = param.cpu().numpy().astype(storage_type) + bias = (0, saved_dir, infer_tp, ft_name, param, None, { + "int8_outputs": int8_outputs, + "multi_query_mode": args.multi_query_mode, + "local_dim": None + }) + starmap_args.append(bias) + elif ft_name.split('.')[-2:] == ['query_key_value', 'weight']: + # Is there other ways to get local_dim? 
local_dim = hidden_size in internlm + local_dim = model.config.hidden_size if args.multi_query_mode else None + if args.smoothquant is None: + merge_qkv_scales(name, model, act_range, internlm_qkv_para) + qkv = (0, saved_dir, infer_tp, ft_name, + internlm_qkv_para.get( + name.replace(".weight", "").replace( + ".q_proj", + ".qkv_proj")).cpu().numpy().astype(storage_type), + act_range.get( + name.replace(".weight", + "").replace(".q_proj", ".qkv_proj")), { + "int8_outputs": int8_outputs, + "multi_query_mode": + args.multi_query_mode, + "local_dim": local_dim, + }) + starmap_args.append(qkv) + elif ft_name.split('.')[-2] == 'kv': + continue + else: + starmap_args.append((0, saved_dir, infer_tp, ft_name, param, + act_range.get(name.replace(".weight", "")), { + "int8_outputs": int8_outputs, + "multi_query_mode": args.multi_query_mode, + "local_dim": None, + })) + + starmap_args = tqdm(starmap_args, desc="saving weights") + if args.processes > 1: + with multiprocessing.Pool(args.processes) as pool: + pool.starmap(split_and_save_weight, starmap_args) + else: + # simpler for debug situations + for starmap_arg in starmap_args: + split_and_save_weight(*starmap_arg) + + +if __name__ == "__main__": + torch.multiprocessing.set_start_method("spawn") + + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--out-dir', + '-o', + type=str, + help='file name of output directory', + required=True) + parser.add_argument('--in-file', + '-i', + type=str, + help='file name of input checkpoint file', + required=True) + parser.add_argument('--tensor-parallelism', + '-tp', + type=int, + help='Requested tensor parallelism for inference', + default=1) + parser.add_argument( + "--processes", + "-p", + type=int, + help="How many processes to spawn for conversion (default: 8)", + default=8) + parser.add_argument( + "--calibrate-kv-cache", + "-kv", + action="store_true", + help= + "Generate scaling factors for KV cache. Used for storing KV cache in int8." + ) + parser.add_argument( + "--smoothquant", + "-sq", + type=float, + default=None, + help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)" + " to Smoothquant the model, and output int8 weights." + " A good first try is 0.5. Must be in [0, 1]") + parser.add_argument("--storage-type", + "-t", + type=str, + default="fp32", + choices=["fp32", "fp16"]) + parser.add_argument("--multi-query-mode", + action="store_true", + help="Use multi-query-attention.") + + args = parser.parse_args() + print("\n=============== Argument ===============") + for key in vars(args): + print("{}: {}".format(key, vars(args)[key])) + print("========================================") + + assert (args.calibrate_kv_cache or args.smoothquant), \ + "Either INT8 kv cache or SmoothQuant must be enabled for this script. Otherwise you can directly build engines from HuggingFace checkpoints, no need to do this FT-format conversion. " + + hf_gpt_converter(args) diff --git a/examples/internlm/quantize.py b/examples/internlm/quantize.py new file mode 100644 index 0000000000..3dc088ee92 --- /dev/null +++ b/examples/internlm/quantize.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Adapted from examples/quantization/hf_ptq.py +""" + +import argparse +import random + +import numpy as np +import torch +from datasets import load_dataset +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer + +from tensorrt_llm._utils import str_dtype_to_torch +from tensorrt_llm.logger import logger +from tensorrt_llm.models.quantized.ammo import quantize_and_export + + +def get_calib_dataloader(data="cnn_dailymail", + tokenizer=None, + batch_size=1, + calib_size=512, + block_size=512): + print("Loading calibration dataset") + if data == "pileval": + dataset = load_dataset( + "json", + data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", + split="train") + dataset = dataset["text"][:calib_size] + elif data == "cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + dataset = dataset["article"][:calib_size] + else: + raise NotImplementedError + + batch_encoded = tokenizer.batch_encode_plus(dataset, + return_tensors="pt", + padding=True, + max_length=block_size) + batch_encoded = batch_encoded["input_ids"] + batch_encoded = batch_encoded.cuda() + + calib_dataloader = DataLoader(batch_encoded, + batch_size=batch_size, + shuffle=False) + + return calib_dataloader + + +def get_tokenizer(ckpt_path, **kwargs): + logger.info(f"Loading tokenizer from {ckpt_path}") + tokenizer = AutoTokenizer.from_pretrained(ckpt_path, + padding_side="left", + **kwargs) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + +def get_model(ckpt_path, dtype="float16"): + logger.info(f"Loading model from {ckpt_path}") + torch_dtype = str_dtype_to_torch(dtype) + model = AutoModelForCausalLM.from_pretrained( + ckpt_path, + device_map="auto", + trust_remote_code=True, + torch_dtype=torch_dtype, + ) + model.eval() + model = model.to(memory_format=torch.channels_last) + return model + + +def get_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--model_dir", + type=str, + required=True, + help="Directory of a HF model checkpoint") + parser.add_argument("--dtype", help="Model data type.", default="float16") + parser.add_argument( + "--qformat", + type=str, + choices=['fp8', 'int4_awq'], + default='fp8', + help='Quantization format. Currently only fp8 is supported. ' + 'For int8 smoothquant, use smoothquant.py instead. 
') + parser.add_argument("--calib_size", + type=int, + default=512, + help="Number of samples for calibration.") + parser.add_argument("--export_path", default="exported_model") + parser.add_argument('--seed', type=int, default=None, help='Random seed') + args = parser.parse_args() + return args + + +def main(): + if not torch.cuda.is_available(): + raise EnvironmentError("GPU is required for inference.") + + args = get_args() + + if args.seed is not None: + random.seed(args.seed) + np.random.seed(args.seed) + + tokenizer = get_tokenizer(args.model_dir) + model = get_model(args.model_dir, args.dtype) + + calib_dataloader = get_calib_dataloader(tokenizer=tokenizer, + calib_size=args.calib_size) + model = quantize_and_export(model, + qformat=args.qformat, + calib_dataloader=calib_dataloader, + export_path=args.export_path) + + +if __name__ == "__main__": + main() diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt new file mode 100644 index 0000000000..926de5f086 --- /dev/null +++ b/examples/internlm/requirements.txt @@ -0,0 +1,3 @@ +datasets==2.14.5 +rouge_score~=0.1.2 +sentencepiece~=0.1.99 diff --git a/examples/internlm/run.py b/examples/internlm/run.py new file mode 100644 index 0000000000..1976027d37 --- /dev/null +++ b/examples/internlm/run.py @@ -0,0 +1,275 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
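+# Example invocation (added; paths and directory names are placeholders, adjust
+# to your own setup):
+#   python run.py --max_output_len 160 \
+#       --tokenizer_dir ./internlm-chat-7b \
+#       --engine_dir ./internlm_checkpoints/trt_engines/fp16/1-gpu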
+import argparse +import csv +import json +from pathlib import Path + +import numpy as np +import torch +from transformers import AutoTokenizer + +import tensorrt_llm +from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.runtime import ModelConfig, SamplingConfig + +from build import get_engine_name # isort:skip + +EOS_TOKEN = 2 +PAD_TOKEN = 2 + + +def throttle_generator(generator, stream_interval): + for i, out in enumerate(generator): + if not i % stream_interval: + yield out + + if i % stream_interval: + yield out + + +def read_config(config_path: Path): + with open(config_path, 'r') as f: + config = json.load(f) + use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] + remove_input_padding = config['plugin_config']['remove_input_padding'] + dtype = config['builder_config']['precision'] + tp_size = config['builder_config']['tensor_parallel'] + pp_size = config['builder_config']['pipeline_parallel'] + world_size = tp_size * pp_size + assert world_size == tensorrt_llm.mpi_world_size(), \ + f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + num_heads = config['builder_config']['num_heads'] // tp_size + hidden_size = config['builder_config']['hidden_size'] // tp_size + vocab_size = config['builder_config']['vocab_size'] + num_layers = config['builder_config']['num_layers'] + num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) + paged_kv_cache = config['plugin_config']['paged_kv_cache'] + tokens_per_block = config['plugin_config']['tokens_per_block'] + quant_mode = QuantMode(config['builder_config']['quant_mode']) + if config['builder_config'].get('multi_query_mode', False): + tensorrt_llm.logger.warning( + "`multi_query_mode` config is deprecated. Please rebuild the engine." 
+ ) + num_kv_heads = 1 + num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size + use_custom_all_reduce = config['plugin_config'].get('use_custom_all_reduce', + False) + + model_config = ModelConfig(num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + remove_input_padding=remove_input_padding, + dtype=dtype, + quant_mode=quant_mode, + use_custom_all_reduce=use_custom_all_reduce) + + return model_config, tp_size, pp_size, dtype + + +def parse_input(input_text: str, input_file: str, tokenizer, end_id: int, + remove_input_padding: bool): + input_tokens = [] + if input_file is None: + input_text = f'<|User|>:{input_text}\n<|Bot|>:' + input_tokens.append( + tokenizer.encode(input_text, add_special_tokens=False)) + print(f'Input: \"{input_text}\"') + print(f'Input: {input_tokens[0]}') + else: + if input_file.endswith('.csv'): + with open(input_file, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for line in csv_reader: + input_tokens.append(np.array(line, dtype='int32')) + elif input_file.endswith('.npy'): + inputs = np.load(input_file) + for row in inputs: + row = row[row != end_id] + input_tokens.append(row) + else: + print('Input file format not supported.') + raise SystemExit + + input_ids = None + input_lengths = torch.tensor([len(x) for x in input_tokens], + dtype=torch.int32, + device='cuda') + if remove_input_padding: + input_ids = np.concatenate(input_tokens) + input_ids = torch.tensor(input_ids, dtype=torch.int32, + device='cuda').unsqueeze(0) + else: + input_ids = torch.nested.to_padded_tensor( + torch.nested.nested_tensor(input_tokens, dtype=torch.int32), + end_id).cuda() + + return input_ids, input_lengths + + +def print_output(output_ids, input_lengths, max_output_len, tokenizer, + output_csv, output_npy): + num_beams = output_ids.size(1) + if output_csv is None and output_npy is None: + for b in range(input_lengths.size(0)): + inputs = output_ids[b][0][:input_lengths[b]].tolist() + input_text = tokenizer.decode(inputs) + print(f'Input: \"{input_text}\"') + for beam in range(num_beams): + output_begin = input_lengths[b] + output_end = input_lengths[b] + max_output_len + outputs = output_ids[b][beam][output_begin:output_end].tolist() + output_text = tokenizer.decode(outputs) + print(f'Output ids: {outputs}') + print(f'Output: \"{output_text}\"') + + output_ids = output_ids.reshape((-1, output_ids.size(2))) + + if output_csv is not None: + output_file = Path(output_csv) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = output_ids.tolist() + with open(output_file, 'w') as csv_file: + writer = csv.writer(csv_file, delimiter=',') + writer.writerows(outputs) + + if output_npy is not None: + output_file = Path(output_npy) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') + np.save(output_file, outputs) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--max_output_len', type=int, required=True) + parser.add_argument('--log_level', type=str, default='error') + parser.add_argument('--engine_dir', type=str, default='internlm_outputs') + parser.add_argument('--tokenizer_dir', + type=str, + default=".", + help="Directory containing the tokenizer.model.") + parser.add_argument('--input_text', + type=str, + default='Tell me about yourself.') + 
parser.add_argument( + '--input_tokens', + dest='input_file', + type=str, + help= + 'CSV or Numpy file containing tokenized input. Alternative to text input.', + default=None) + parser.add_argument('--output_csv', + type=str, + help='CSV file where the tokenized output is stored.', + default=None) + parser.add_argument('--output_npy', + type=str, + help='Numpy file where the tokenized output is stored.', + default=None) + parser.add_argument('--num_beams', + type=int, + help="Use beam search if num_beams >1", + default=1) + parser.add_argument('--streaming', default=False, action='store_true') + parser.add_argument('--streaming_interval', + type=int, + help="How often to return tokens when streaming.", + default=5) + return parser.parse_args() + + +def generate( + max_output_len: int, + log_level: str = 'error', + engine_dir: str = 'internlm_outputs', + input_text: str = 'Tell me about yourself.', + input_file: str = None, + output_csv: str = None, + output_npy: str = None, + tokenizer_dir: str = None, + num_beams: int = 1, + streaming: bool = False, + streaming_interval: int = 5, +): + tensorrt_llm.logger.set_level(log_level) + + engine_dir = Path(engine_dir) + config_path = engine_dir / 'config.json' + model_config, tp_size, pp_size, dtype = read_config(config_path) + world_size = tp_size * pp_size + + runtime_rank = tensorrt_llm.mpi_rank() + runtime_mapping = tensorrt_llm.Mapping(world_size, + runtime_rank, + tp_size=tp_size, + pp_size=pp_size) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + legacy=False, + trust_remote_code=True) + + sampling_config = SamplingConfig(end_id=EOS_TOKEN, + pad_id=PAD_TOKEN, + num_beams=num_beams) + + engine_name = get_engine_name('internlm', dtype, tp_size, pp_size, + runtime_rank) + serialize_path = engine_dir / engine_name + with open(serialize_path, 'rb') as f: + engine_buffer = f.read() + decoder = tensorrt_llm.runtime.GenerationSession(model_config, + engine_buffer, + runtime_mapping, + debug_mode=False, + debug_tensors_to_save=None) + if runtime_rank == 0: + print(f"Running the {dtype} engine ...") + + input_ids, input_lengths = parse_input(input_text, input_file, tokenizer, + EOS_TOKEN, + model_config.remove_input_padding) + + max_input_length = torch.max(input_lengths).item() + decoder.setup(input_lengths.size(0), max_input_length, max_output_len, + num_beams) + + output_gen_ids = decoder.decode(input_ids, + input_lengths, + sampling_config, + streaming=streaming) + torch.cuda.synchronize() + if streaming: + for output_ids in throttle_generator(output_gen_ids, + streaming_interval): + if runtime_rank == 0: + print_output(output_ids, input_lengths, max_output_len, + tokenizer, output_csv, output_npy) + else: + output_ids = output_gen_ids + if runtime_rank == 0: + print_output(output_ids, input_lengths, max_output_len, tokenizer, + output_csv, output_npy) + + +if __name__ == '__main__': + args = parse_arguments() + generate(**vars(args)) diff --git a/examples/internlm/smoothquant.py b/examples/internlm/smoothquant.py new file mode 100644 index 0000000000..4e4145cb4e --- /dev/null +++ b/examples/internlm/smoothquant.py @@ -0,0 +1,204 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Utilities for SmoothQuant models +''' + +import copy +import functools +from collections import defaultdict + +import torch +import torch.nn as nn +from tqdm import tqdm +from transformers.pytorch_utils import Conv1D + + +@torch.no_grad() +def apply_smoothing(scales, + gemm_weights, + layernorm_weights=None, + layernorm_bias=None, + dtype=torch.float32, + layernorm_1p=False): + if not isinstance(gemm_weights, list): + gemm_weights = [gemm_weights] + + if layernorm_weights is not None: + assert layernorm_weights.numel() == scales.numel() + layernorm_weights.div_(scales).to(dtype) + if layernorm_bias is not None: + assert layernorm_bias.numel() == scales.numel() + layernorm_bias.div_(scales).to(dtype) + if layernorm_1p: + layernorm_weights += (1 / scales) - 1 + + for gemm in gemm_weights: + gemm.mul_(scales.view(1, -1)).to(dtype) + + +@torch.no_grad() +def smooth_gemm(gemm_weights, + act_scales, + layernorm_weights=None, + layernorm_bias=None, + alpha=0.5, + weight_scales=None): + if not isinstance(gemm_weights, list): + gemm_weights = [gemm_weights] + orig_dtype = gemm_weights[0].dtype + + for gemm in gemm_weights: + # gemm_weights are expected to be transposed + assert gemm.shape[1] == act_scales.numel() + + if weight_scales is None: + weight_scales = torch.cat( + [gemm.abs().max(dim=0, keepdim=True)[0] for gemm in gemm_weights], + dim=0) + weight_scales = weight_scales.max(dim=0)[0] + weight_scales.to(float).clamp(min=1e-5) + scales = (act_scales.to(gemm_weights[0].device).to(float).pow(alpha) / + weight_scales.pow(1 - alpha)).clamp(min=1e-5) + + apply_smoothing(scales, gemm_weights, layernorm_weights, layernorm_bias, + orig_dtype) + + return scales + + +@torch.no_grad() +def smooth_gemm_fc1_gate(fc1_weights, + gate_weights, + act_scales, + layernorm_weights=None, + layernorm_bias=None, + alpha=0.5, + weight_scales=None): + gemm_weights = [] + if not isinstance(fc1_weights, list): + fc1_weights = [fc1_weights] + if not isinstance(gate_weights, list): + gate_weights = [gate_weights] + + for i in range(len(fc1_weights)): + gemm_weight = torch.cat([fc1_weights[i], gate_weights[i]], dim=0) + gemm_weights.append(gemm_weight) + + orig_dtype = gemm_weights[0].dtype + + for gemm in gemm_weights: + # gemm_weights are expected to be transposed + assert gemm.shape[1] == act_scales.numel() + + if weight_scales is None: + weight_scales = torch.cat( + [gemm.abs().max(dim=0, keepdim=True)[0] for gemm in gemm_weights], + dim=0) + weight_scales = weight_scales.max(dim=0)[0] + weight_scales.to(float).clamp(min=1e-5) + scales = (act_scales.to(gemm_weights[0].device).to(float).pow(alpha) / + weight_scales.pow(1 - alpha)).clamp(min=1e-5) + + apply_smoothing(scales, fc1_weights + gate_weights, layernorm_weights, + layernorm_bias, orig_dtype) + + return scales + + +@torch.no_grad() +def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5): + if not isinstance(fcs, list): + fcs = [fcs] + for fc in fcs: + assert isinstance(fc, nn.Linear) + assert ln.weight.numel() == fc.in_features == act_scales.numel() + + device, dtype = fcs[0].weight.device, fcs[0].weight.dtype + act_scales = 
act_scales.to(device=device, dtype=dtype) + weight_scales = torch.cat( + [fc.weight.abs().max(dim=0, keepdim=True)[0] for fc in fcs], dim=0) + weight_scales = weight_scales.max(dim=0)[0].clamp(min=1e-5) + + scales = (act_scales.pow(alpha) / + weight_scales.pow(1 - alpha)).clamp(min=1e-5).to(device).to(dtype) + + if ln is not None: + ln.weight.div_(scales) + ln.bias.div_(scales) + + for fc in fcs: + fc.weight.mul_(scales.view(1, -1)) + return scales + + +@torch.no_grad() +def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): + model.eval() + next(model.parameters()).device + act_scales = defaultdict(lambda: {"x": None, "y": None, "w": None}) + + test_token_num = 923 + tokenizer.pad_token = tokenizer.eos_token + + def stat_tensor(name, tensor, act_scales, key): + hidden_dim = tensor.shape[-1] + tensor = tensor.view(-1, hidden_dim).abs().detach() + comming_max = torch.max(tensor, dim=0)[0].float() + + if act_scales[name][key] is None: + act_scales[name][key] = comming_max + else: + act_scales[name][key] = torch.max(act_scales[name][key], + comming_max) + + def stat_input_hook(m, x, y, name): + if isinstance(x, tuple): + x = x[0] + stat_tensor(name, x, act_scales, "x") + stat_tensor(name, y, act_scales, "y") + + if act_scales[name]["w"] is None: + act_scales[name]["w"] = m.weight.abs().clip(1e-8, + None).max(dim=1)[0] + + hooks = [] + for name, m in model.named_modules(): + if isinstance(m, nn.Linear) or isinstance(m, Conv1D): + hooks.append( + m.register_forward_hook( + functools.partial(stat_input_hook, name=name))) + + from datasets import load_dataset + dataset_cnn = load_dataset("ccdv/cnn_dailymail", '3.0.0') + + for i in tqdm(range(num_samples), desc="calibrating model"): + datapoint = dataset_cnn['train'][i:i + 1] + line = copy.copy(datapoint['article']) + line[0] = line[0] + ' TL;DR: ' + line[0] = line[0].strip() + line[0] = line[0].replace(" n't", "n't") + line_encoded = tokenizer(line, + return_tensors="pt", + padding=True, + truncation=True)["input_ids"].type(torch.int64) + line_encoded = line_encoded[:, -test_token_num:] + line_encoded = line_encoded.cuda() + model(line_encoded) + + for h in hooks: + h.remove() + + return act_scales diff --git a/examples/internlm/summarize.py b/examples/internlm/summarize.py new file mode 100644 index 0000000000..2199014aaf --- /dev/null +++ b/examples/internlm/summarize.py @@ -0,0 +1,414 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
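+# Example invocation (added; paths are placeholders, adjust to your own setup):
+#   python summarize.py --test_trt_llm --check_accuracy \
+#       --hf_model_location ./internlm-chat-7b \
+#       --engine_dir ./internlm_checkpoints/trt_engines/fp16/1-gpu \
+#       --batch_size 1 --max_ite 20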
+import argparse +import copy +import json +import os + +import numpy as np +import torch +from datasets import load_dataset, load_metric +from transformers import AutoModelForCausalLM, AutoTokenizer + +import tensorrt_llm +import tensorrt_llm.profiler as profiler +from tensorrt_llm.logger import logger +from tensorrt_llm.quantization import QuantMode + +from build import get_engine_name # isort:skip + + +def TRTInternLM(args, config): + dtype = config['builder_config']['precision'] + tp_size = config['builder_config']['tensor_parallel'] + pp_size = config['builder_config']['pipeline_parallel'] + world_size = tp_size * pp_size + + assert world_size == tensorrt_llm.mpi_world_size(), \ + f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + + num_heads = config['builder_config']['num_heads'] // tp_size + hidden_size = config['builder_config']['hidden_size'] // tp_size + vocab_size = config['builder_config']['vocab_size'] + num_layers = config['builder_config']['num_layers'] + use_gpt_attention_plugin = bool( + config['plugin_config']['gpt_attention_plugin']) + remove_input_padding = config['plugin_config']['remove_input_padding'] + num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) + paged_kv_cache = config['plugin_config']['paged_kv_cache'] + tokens_per_block = config['plugin_config']['tokens_per_block'] + use_custom_all_reduce = config['plugin_config'].get('use_custom_all_reduce', + False) + + quant_mode = QuantMode(config['builder_config']['quant_mode']) + if config['builder_config'].get('multi_query_mode', False): + tensorrt_llm.logger.warning( + "`multi_query_mode` config is deprecated. Please rebuild the engine." + ) + num_kv_heads = 1 + num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size + + model_config = tensorrt_llm.runtime.ModelConfig( + vocab_size=vocab_size, + num_layers=num_layers, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + gpt_attention_plugin=use_gpt_attention_plugin, + remove_input_padding=remove_input_padding, + use_custom_all_reduce=use_custom_all_reduce, + dtype=dtype, + quant_mode=quant_mode) + + runtime_rank = tensorrt_llm.mpi_rank() + runtime_mapping = tensorrt_llm.Mapping(world_size, + runtime_rank, + tp_size=tp_size, + pp_size=pp_size) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + + engine_name = get_engine_name('internlm', dtype, tp_size, pp_size, + runtime_rank) + serialize_path = os.path.join(args.engine_dir, engine_name) + + tensorrt_llm.logger.set_level(args.log_level) + + profiler.start('load tensorrt_llm engine') + with open(serialize_path, 'rb') as f: + engine_buffer = f.read() + decoder = tensorrt_llm.runtime.GenerationSession(model_config, + engine_buffer, + runtime_mapping) + profiler.stop('load tensorrt_llm engine') + tensorrt_llm.logger.info( + f'Load engine takes: {profiler.elapsed_time_in_sec("load tensorrt_llm engine")} sec' + ) + return decoder + + +def main(args): + runtime_rank = tensorrt_llm.mpi_rank() + logger.set_level(args.log_level) + + test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 + test_trt_llm = args.test_trt_llm + hf_model_location = args.hf_model_location + profiler.start('load tokenizer') + tokenizer = AutoTokenizer.from_pretrained(hf_model_location, + legacy=False, + padding_side='left', + trust_remote_code=True) + profiler.stop('load tokenizer') + tensorrt_llm.logger.info( + f'Load tokenizer takes: 
{profiler.elapsed_time_in_sec("load tokenizer")} sec' + ) + tokenizer.pad_token = tokenizer.eos_token + + dataset_cnn = load_dataset("ccdv/cnn_dailymail", + '3.0.0', + cache_dir=args.dataset_path) + + max_batch_size = args.batch_size + + # runtime parameters + # repetition_penalty = 1 + top_k = args.top_k + output_len = 100 + test_token_num = 923 + # top_p = 0.0 + # random_seed = 5 + temperature = 1 + num_beams = args.num_beams + + pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] + end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] + + if test_trt_llm: + config_path = os.path.join(args.engine_dir, 'config.json') + with open(config_path, 'r') as f: + config = json.load(f) + + tensorrt_llm_internlm = TRTInternLM(args, config) + + if test_hf: + profiler.start('load HF model') + model = AutoModelForCausalLM.from_pretrained(hf_model_location, + trust_remote_code=True) + profiler.stop('load HF model') + tensorrt_llm.logger.info( + f'Load HF model takes: {profiler.elapsed_time_in_sec("load HF model")} sec' + ) + if args.data_type == 'fp16': + model.half() + elif args.data_type == 'fp32': + model = model.float() + elif args.data_type == 'bf16': + model = model.to(dtype=torch.bfloat16) + # else use dtype in hf config, which is by default + model.cuda() + + def summarize_tensorrt_llm(datapoint): + batch_size = len(datapoint['article']) + + line = copy.copy(datapoint['article']) + line_encoded = [] + input_lengths = [] + for i in range(batch_size): + line[i] = line[i] + ' TL;DR: ' + + line[i] = line[i].strip() + line[i] = line[i].replace(" n't", "n't") + + input_id = tokenizer.encode(line[i], + return_tensors='pt').type(torch.int32) + input_id = input_id[:, -test_token_num:] + + line_encoded.append(input_id) + input_lengths.append(input_id.shape[-1]) + + # do padding, should move outside the profiling to prevent the overhead + max_length = max(input_lengths) + if tensorrt_llm_internlm.remove_input_padding: + line_encoded = [ + torch.as_tensor(t, dtype=torch.int32, device='cuda') + for t in line_encoded + ] + else: + # do padding, should move outside the profiling to prevent the overhead + for i in range(batch_size): + pad_size = max_length - input_lengths[i] + + pad = torch.ones([1, pad_size]).type(torch.int32) * pad_id + line_encoded[i] = torch.cat( + [torch.tensor(line_encoded[i], dtype=torch.int32), pad], + axis=-1) + + line_encoded = torch.cat(line_encoded, axis=0).cuda() + input_lengths = torch.tensor(input_lengths, + dtype=torch.int32).cuda() + + sampling_config = tensorrt_llm.runtime.SamplingConfig( + end_id=end_id, pad_id=pad_id, top_k=top_k, num_beams=num_beams) + + with torch.no_grad(): + tensorrt_llm_internlm.setup(batch_size, + max_context_length=max_length, + max_new_tokens=output_len, + beam_width=num_beams) + + if tensorrt_llm_internlm.remove_input_padding: + output_ids = tensorrt_llm_internlm.decode_batch( + line_encoded, sampling_config) + else: + output_ids = tensorrt_llm_internlm.decode( + line_encoded, + input_lengths, + sampling_config, + ) + + torch.cuda.synchronize() + + # Extract a list of tensors of shape beam_width x output_ids. 
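+        # Added note: output_ids has shape [batch_size, num_beams, seq_len] and
+        # holds the prompt tokens followed by the generated continuation, so the
+        # slices below keep only the newly generated summary tokens per beam.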
+ if tensorrt_llm_internlm.mapping.is_first_pp_rank(): + output_beams_list = [ + tokenizer.batch_decode(output_ids[batch_idx, :, + input_lengths[batch_idx]:], + skip_special_tokens=True) + for batch_idx in range(batch_size) + ] + return output_beams_list, output_ids[:, :, max_length:].tolist() + return [], [] + + def summarize_hf(datapoint): + batch_size = len(datapoint['article']) + if batch_size > 1: + logger.warning( + f"HF does not support batch_size > 1 to verify correctness due to padding. Current batch size is {batch_size}" + ) + + line = copy.copy(datapoint['article']) + for i in range(batch_size): + line[i] = line[i] + ' TL;DR: ' + + line[i] = line[i].strip() + line[i] = line[i].replace(" n't", "n't") + + line_encoded = tokenizer(line, + return_tensors='pt', + padding=True, + truncation=True)["input_ids"].type(torch.int64) + + line_encoded = line_encoded[:, -test_token_num:] + line_encoded = line_encoded.cuda() + + with torch.no_grad(): + output = model.generate(line_encoded, + max_length=len(line_encoded[0]) + + output_len, + top_k=top_k, + temperature=temperature, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, + num_beams=num_beams, + num_return_sequences=num_beams, + early_stopping=True) + + tokens_list = output[:, len(line_encoded[0]):].tolist() + output = output.reshape([batch_size, num_beams, -1]) + output_lines_list = [ + tokenizer.batch_decode(output[:, i, len(line_encoded[0]):], + skip_special_tokens=True) + for i in range(num_beams) + ] + + return output_lines_list, tokens_list + + if test_trt_llm: + datapoint = dataset_cnn['test'][0:1] + summary, _ = summarize_tensorrt_llm(datapoint) + if runtime_rank == 0: + logger.info( + "---------------------------------------------------------") + logger.info("TensorRT-LLM Generated : ") + logger.info(f" Article : {datapoint['article']}") + logger.info(f"\n Highlights : {datapoint['highlights']}") + logger.info(f"\n Summary : {summary}") + logger.info( + "---------------------------------------------------------") + + if test_hf: + datapoint = dataset_cnn['test'][0:1] + summary, _ = summarize_hf(datapoint) + logger.info("---------------------------------------------------------") + logger.info("HF Generated : ") + logger.info(f" Article : {datapoint['article']}") + logger.info(f"\n Highlights : {datapoint['highlights']}") + logger.info(f"\n Summary : {summary}") + logger.info("---------------------------------------------------------") + + metric_tensorrt_llm = [load_metric("rouge") for _ in range(num_beams)] + metric_hf = [load_metric("rouge") for _ in range(num_beams)] + for i in range(num_beams): + metric_tensorrt_llm[i].seed = 0 + metric_hf[i].seed = 0 + + ite_count = 0 + data_point_idx = 0 + while (data_point_idx < len(dataset_cnn['test'])) and (ite_count < + args.max_ite): + if runtime_rank == 0: + logger.debug( + f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" + ) + datapoint = dataset_cnn['test'][data_point_idx:(data_point_idx + + max_batch_size)] + + if test_trt_llm: + profiler.start('tensorrt_llm') + summary_tensorrt_llm, tokens_tensorrt_llm = summarize_tensorrt_llm( + datapoint) + profiler.stop('tensorrt_llm') + + if test_hf: + profiler.start('hf') + summary_hf, tokens_hf = summarize_hf(datapoint) + profiler.stop('hf') + + if runtime_rank == 0: + if test_trt_llm: + for batch_idx in range(len(summary_tensorrt_llm)): + for beam_idx in range(num_beams): + metric_tensorrt_llm[beam_idx].add_batch( + predictions=[ + summary_tensorrt_llm[batch_idx][beam_idx] + ], + 
references=[datapoint['highlights'][batch_idx]]) + if test_hf: + for beam_idx in range(num_beams): + for batch_idx in range(len(summary_hf[beam_idx])): + metric_hf[beam_idx].add_batch( + predictions=[summary_hf[beam_idx][batch_idx]], + references=[datapoint['highlights'][batch_idx]]) + + logger.debug('-' * 100) + logger.debug(f"Article : {datapoint['article']}") + if test_trt_llm: + logger.debug(f'TensorRT-LLM Summary: {summary_tensorrt_llm}') + if test_hf: + logger.debug(f'HF Summary: {summary_hf}') + logger.debug(f"highlights : {datapoint['highlights']}") + + data_point_idx += max_batch_size + ite_count += 1 + + if runtime_rank == 0: + if test_trt_llm: + np.random.seed(0) # rouge score use sampling to compute the score + logger.info( + f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' + ) + for beam_idx in range(num_beams): + logger.info(f"TensorRT-LLM beam {beam_idx} result") + computed_metrics_tensorrt_llm = metric_tensorrt_llm[ + beam_idx].compute() + for key in computed_metrics_tensorrt_llm.keys(): + logger.info( + f' {key} : {computed_metrics_tensorrt_llm[key].mid[2]*100}' + ) + + if args.check_accuracy and beam_idx == 0: + assert computed_metrics_tensorrt_llm['rouge1'].mid[ + 2] * 100 > args.tensorrt_llm_rouge1_threshold + if test_hf: + np.random.seed(0) # rouge score use sampling to compute the score + logger.info( + f'Hugging Face (total latency: {profiler.elapsed_time_in_sec("hf")} sec)' + ) + for beam_idx in range(num_beams): + logger.info(f"HF beam {beam_idx} result") + computed_metrics_hf = metric_hf[beam_idx].compute() + for key in computed_metrics_hf.keys(): + logger.info( + f' {key} : {computed_metrics_hf[key].mid[2]*100}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--hf_model_location', + type=str, + default='internlm-7b-hf') + parser.add_argument('--test_hf', action='store_true') + parser.add_argument('--test_trt_llm', action='store_true') + parser.add_argument('--data_type', + type=str, + choices=['fp32', 'fp16', 'bf16'], + default='auto') + parser.add_argument('--dataset_path', type=str, default='') + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument('--engine_dir', type=str, default='internlm_outputs') + parser.add_argument('--batch_size', type=int, default=1) + parser.add_argument('--max_ite', type=int, default=20) + parser.add_argument('--check_accuracy', action='store_true') + parser.add_argument('--tensorrt_llm_rouge1_threshold', + type=float, + default=15.0) + parser.add_argument('--num_beams', type=int, default=1) + parser.add_argument('--top_k', type=int, default=1) + + args = parser.parse_args() + + main(args) diff --git a/examples/internlm/weight.py b/examples/internlm/weight.py new file mode 100644 index 0000000000..7293962c74 --- /dev/null +++ b/examples/internlm/weight.py @@ -0,0 +1,1318 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import configparser +import math +import time +from operator import attrgetter +from pathlib import Path +from typing import Dict, List, Optional, Union + +import numpy as np +import torch +from safetensors import safe_open + +import tensorrt_llm +import tensorrt_llm.logger as logger +from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import InternLMForCausalLM +from tensorrt_llm.models.quantized.quant import get_dummy_quant_scales +from tensorrt_llm.quantization import QuantMode + + +def get_scaling_factors( + model_path: Union[str, Path], + num_layers: int, + quant_mode: Optional[QuantMode] = None, +) -> Optional[Dict[str, List[int]]]: + """ Get the scaling factors for InternLM model + + Returns a dictionary of scaling factors for the selected layers of the + InternLM model. + + Args: + model_path (str): Path to the quantized InternLM model + layers (list): List of layers to get the scaling factors for. If None, + all layers are selected. + + Returns: + dict: Dictionary of scaling factors for the selected layers of the + InternLM model. + + example: + + { + 'qkv_act': qkv_act_scale, + 'qkv_weights': qkv_weights_scale, + 'qkv_output' : qkv_outputs_scale, + 'dense_act': dense_act_scale, + 'dense_weights': dense_weights_scale, + 'fc_act': fc_act_scale, + 'fc_weights': fc_weights_scale, + 'gate_act': gate_act_scale, + 'gate_weights': gate_weights_scale, + 'proj_act': proj_act_scale, + 'proj_weights': proj_weights_scale, + } + """ + + if model_path is None: + logger.warning(f"--quantized_fp8_model_path not specified. " + f"Initialize quantization scales automatically.") + return get_dummy_quant_scales(num_layers) + weight_dict = np.load(model_path) + + # yapf: disable + scaling_factor = { + 'qkv_act': [], + 'qkv_weights': [], + 'qkv_output': [], + 'dense_act': [], + 'dense_weights': [], + 'fc_act': [], + 'fc_weights': [], + 'gate_act': [], + 'gate_weights': [], + 'proj_act': [], + 'proj_weights': [], + } + + for layer in range(num_layers): + scaling_factor['qkv_act'].append(max( + weight_dict[f'_np:layers:{layer}:attention:qkv:q:activation_scaling_factor'].item(), + weight_dict[f'_np:layers:{layer}:attention:qkv:k:activation_scaling_factor'].item(), + weight_dict[f'_np:layers:{layer}:attention:qkv:v:activation_scaling_factor'].item() + )) + scaling_factor['qkv_weights'].append(max( + weight_dict[f'_np:layers:{layer}:attention:qkv:q:weights_scaling_factor'].item(), + weight_dict[f'_np:layers:{layer}:attention:qkv:k:weights_scaling_factor'].item(), + weight_dict[f'_np:layers:{layer}:attention:qkv:v:weights_scaling_factor'].item() + )) + if quant_mode is not None and quant_mode.has_fp8_kv_cache(): + # Not calibrarting KV cache. 
+ scaling_factor['qkv_output'].append(1.0) + scaling_factor['dense_act'].append(weight_dict[f'_np:layers:{layer}:attention:dense:activation_scaling_factor'].item()) + scaling_factor['dense_weights'].append(weight_dict[f'_np:layers:{layer}:attention:dense:weights_scaling_factor'].item()) + scaling_factor['fc_act'].append(weight_dict[f'_np:layers:{layer}:mlp:fc:activation_scaling_factor'].item()) + scaling_factor['fc_weights'].append(weight_dict[f'_np:layers:{layer}:mlp:fc:weights_scaling_factor'].item()) + scaling_factor['gate_act'].append(weight_dict[f'_np:layers:{layer}:mlp:gate:activation_scaling_factor'].item()) + scaling_factor['gate_weights'].append(weight_dict[f'_np:layers:{layer}:mlp:gate:weights_scaling_factor'].item()) + scaling_factor['proj_act'].append(weight_dict[f'_np:layers:{layer}:mlp:proj:activation_scaling_factor'].item()) + scaling_factor['proj_weights'].append(weight_dict[f'_np:layers:{layer}:mlp:proj:weights_scaling_factor'].item()) + # yapf: enable + for k, v in scaling_factor.items(): + assert len(v) == num_layers, \ + f'Expect scaling factor {k} of length {num_layers}, got {len(v)}' + + return scaling_factor + + +def gen_suffix(rank, use_smooth_quant, quant_per_channel): + suffix = f"{rank}.bin" + if use_smooth_quant: + sq_prefix = "int8." + if quant_per_channel: + sq_prefix += "col." + suffix = sq_prefix + suffix + return suffix + + +def extract_layer_idx(name): + ss = name.split('.') + for s in ss: + if s.isdigit(): + return s + return None + + +def split(v, tp_size, idx, dim=0): + if tp_size == 1: + return v + if len(v.shape) == 1: + return np.ascontiguousarray(np.split(v, tp_size)[idx].copy()) + else: + return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx].copy()) + + +def dup_kv_weight(v, num_head, tp_size): + assert tp_size % num_head == 0 + reps = tp_size // num_head + head_size = v.shape[0] // num_head + v = v.reshape(num_head, head_size, + -1)[:, None, :, :].expand(num_head, reps, head_size, + v.shape[1]) + return v.reshape(num_head * reps * head_size, -1).clone() + + +def parse_ft_config(ini_file): + gpt_config = configparser.ConfigParser() + gpt_config.read(ini_file) + + n_embd = gpt_config.getint('internlm', 'hidden_size') + n_head = gpt_config.getint('internlm', 'num_attention_heads') + n_layer = gpt_config.getint('internlm', 'num_hidden_layers') + n_positions = gpt_config.getint('internlm', 'max_position_embeddings') + vocab_size = gpt_config.getint('internlm', 'vocab_size') + hidden_act = gpt_config.get('internlm', 'hidden_act') + inter_size = gpt_config.getint('internlm', + 'intermediate_size', + fallback=None) + n_kv_head = gpt_config.getint('internlm', + 'num_key_value_heads', + fallback=n_head) + attn_bias = gpt_config.getboolean('internlm', 'bias', fallback=False) + + if inter_size is None: + inter_size = math.ceil(8 / 3 * n_embd / 256) * 256 + + return n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head, attn_bias + + +def load_from_hf_internlm( + tensorrt_llm_internlm: tensorrt_llm.models.InternLMForCausalLM, + hf_internlm, + mapping=Mapping(), + dtype='float32'): + tensorrt_llm.logger.info('Loading weights from HF InternLM...') + tik = time.time() + + quant_mode = getattr(tensorrt_llm_internlm, 'quant_mode', QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + use_weight_only = quant_mode.is_weight_only() + num_kv_heads = tensorrt_llm_internlm.num_kv_heads + mha_mode = 
(num_kv_heads == tensorrt_llm_internlm.num_heads) + assert mha_mode, "All InternLM variants should be MHA mode" + + model_params = dict(hf_internlm.named_parameters()) + for l in range(hf_internlm.config.num_hidden_layers): + prefix = f'model.layers.{l}.self_attn.' + q_weight = model_params[prefix + 'q_proj.weight'] + k_weight = model_params[prefix + 'k_proj.weight'] + v_weight = model_params[prefix + 'v_proj.weight'] + + if not mha_mode: + head_size = tensorrt_llm_internlm.hidden_size // tensorrt_llm_internlm.num_heads + if num_kv_heads < mapping.tp_size: + # duplicate the KV heads up to tensor_parallel + k_weight = dup_kv_weight(k_weight, num_kv_heads, + mapping.tp_size) + v_weight = dup_kv_weight(v_weight, num_kv_heads, + mapping.tp_size) + assert (k_weight.shape[0] % (mapping.tp_size * head_size)) == 0 + assert (v_weight.shape[0] % (mapping.tp_size * head_size)) == 0 + qkv_weight = [q_weight, k_weight, v_weight] + else: + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) + + model_params[prefix + 'qkv_proj.weight'] = qkv_weight + + if prefix + 'q_proj.bias' in model_params: + # only used in 7B models + # assert not mha_mode, "MHA mode not used in internlm 7B models" + q_bias = model_params[prefix + 'q_proj.bias'] + k_bias = model_params[prefix + 'k_proj.bias'] + v_bias = model_params[prefix + 'v_proj.bias'] + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0) + model_params[prefix + 'qkv_proj.bias'] = qkv_bias + + torch_dtype = str_dtype_to_torch(dtype) + layers_per_pipeline_stage = hf_internlm.config.num_hidden_layers // mapping.pp_size + layers_range = list( + range(mapping.pp_rank * layers_per_pipeline_stage, + (mapping.pp_rank + 1) * layers_per_pipeline_stage, 1)) + for k, v in model_params.items(): + if isinstance(v, list): + v = [torch_to_numpy(vv.to(torch_dtype).detach().cpu()) for vv in v] + else: + v = torch_to_numpy(v.to(torch_dtype).detach().cpu()) + if 'model.embed_tokens.weight' in k: + if tensorrt_llm_internlm.use_parallel_embedding: + v = split(v, mapping.tp_size, mapping.tp_rank, + tensorrt_llm_internlm.embedding_sharding_dim) + if mapping.is_first_pp_rank(): + tensorrt_llm_internlm.vocab_embedding.weight.value = v + elif 'model.norm.weight' in k: + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.ln_f.weight.value = v + elif 'lm_head.weight' in k: + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.lm_head.weight.value = np.ascontiguousarray( + split(v, mapping.tp_size, mapping.tp_rank)) + else: + layer_idx = extract_layer_idx(k) + if layer_idx is None or int(layer_idx) not in layers_range: + continue + idx = int(layer_idx) - mapping.pp_rank * layers_per_pipeline_stage + if idx >= tensorrt_llm_internlm.num_layers: + continue + if 'input_layernorm.weight' in k: + tensorrt_llm_internlm.layers[ + idx].input_layernorm.weight.value = v + elif 'post_attention_layernorm.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].post_layernorm.weight + dst.value = v + elif 'self_attn.qkv_proj.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].attention.qkv.weight + if not mha_mode: + assert isinstance(v, list) and len(v) == 3 + wq = split(v[0], mapping.tp_size, mapping.tp_rank) + wk = split(v[1], mapping.tp_size, mapping.tp_rank) + wv = split(v[2], mapping.tp_size, mapping.tp_rank) + split_v = np.concatenate((wq, wk, wv)) + else: + q_emb = v.shape[0] // 3 + model_emb = v.shape[1] + v = v.reshape(3, q_emb, model_emb) + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=1) + split_v = split_v.reshape(3 * (q_emb // mapping.tp_size), + model_emb) + 
if use_weight_only: + v = np.ascontiguousarray(split_v.transpose()) + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(v), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + idx].attention.qkv.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(split_v) + elif 'self_attn.qkv_proj.bias' in k: + dst = tensorrt_llm_internlm.layers[idx].attention.qkv.bias + if not mha_mode: + assert isinstance(v, list) and len(v) == 3 + bq = split(v[0], mapping.tp_size, mapping.tp_rank) + bk = split(v[1], mapping.tp_size, mapping.tp_rank) + bv = split(v[2], mapping.tp_size, mapping.tp_rank) + split_v = np.concatenate((bq, bk, bv)) + else: + q_emb = v.shape[0] // 3 + v = v.reshape(3, q_emb) + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=1) + split_v = split_v.reshape(3 * (q_emb // mapping.tp_size)) + dst.value = np.ascontiguousarray(split_v) + elif 'self_attn.o_proj.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].attention.dense.weight + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=1) + if use_weight_only: + v = np.ascontiguousarray(split_v.transpose()) + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(v), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + idx].attention.dense.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(split_v) + elif 'self_attn.o_proj.bias' in k: + dst = tensorrt_llm_internlm.layers[idx].attention.dense.bias + split_v = v # no need to divide among ranks? 
+ dst.value = np.ascontiguousarray(split_v) + elif 'mlp.up_proj.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].mlp.gate.weight + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=0) + if use_weight_only: + v = np.ascontiguousarray(split_v.transpose()) + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(v), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + idx].mlp.gate.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(split_v) + elif 'mlp.down_proj.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].mlp.proj.weight + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=1) + if use_weight_only: + v = np.ascontiguousarray(split_v.transpose()) + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(v), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + idx].mlp.proj.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(split_v) + elif 'mlp.gate_proj.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].mlp.fc.weight + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=0) + if use_weight_only: + v = np.ascontiguousarray(split_v.transpose()) + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(v), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + idx].mlp.fc.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(split_v) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. 
Total time: {t}') + return + + +def load_from_meta_internlm( + tensorrt_llm_internlm: tensorrt_llm.models.InternLMForCausalLM, + meta_ckpt_dir, + mapping=Mapping(), + dtype="float32"): + torch_dtype = str_dtype_to_torch(dtype) + + def gather_ckpts(ckpts): + gathered = {} + for k in ckpts[0]: + d = 0 + if any([n in k for n in ["wo", "w2", "tok"]]): + d = 1 + if "norm" in k or "rope" in k: # no TP + gathered[k] = ckpts[0][k].clone() + else: + gathered[k] = torch.cat([pt[k] for pt in ckpts], dim=d).clone() + return gathered + + def split_ckpt(ckpt, ranks_per_ckpt, ckpt_rank): + split_ckpt = {} + for k in ckpt: + d = 0 + if any([n in k for n in ["wo", "w2", "tok"]]): + d = 1 + if "norm" in k or "rope" in k: # no TP + split_ckpt[k] = ckpt[k].clone() + elif tensorrt_llm_internlm.num_kv_heads < mapping.tp_size and any( + [n in k for n in ["wk", "wv"]]): + assert mapping.tp_size % tensorrt_llm_internlm.num_kv_heads == 0 + # special case: we need to duplicate KV head + tmp = dup_kv_weight(ckpt[k], tensorrt_llm_internlm.num_kv_heads, + mapping.tp_size) + split_ckpt[k] = torch.split(tmp, + tmp.shape[d] // ranks_per_ckpt, + dim=d)[ckpt_rank].clone() + else: + split_ckpt[k] = torch.split(ckpt[k], + ckpt[k].shape[d] // ranks_per_ckpt, + dim=d)[ckpt_rank].clone() + return split_ckpt + + def get_current_weights(num_ckpts): + if num_ckpts > mapping.tp_size: + # combine ckpts + assert (num_ckpts % mapping.tp_size) == 0 + nf = num_ckpts // mapping.tp_size + fs = nf * mapping.tp_rank + file_ids = list(range(fs, fs + nf)) + ckpts = [] + for f in file_ids: + ckpt = torch.load(Path(meta_ckpt_dir, + f"consolidated.{f:02d}.pth"), + map_location="cpu") + ckpts.append(ckpt) + return gather_ckpts(ckpts) + elif num_ckpts < mapping.tp_size: + # split ckpt + assert (mapping.tp_size % num_ckpts) == 0 + ranks_per_ckpt = mapping.tp_size // num_ckpts + ckpt_fid = mapping.tp_rank // ranks_per_ckpt + ckpt_rank = mapping.tp_rank % ranks_per_ckpt + nH_per_ckpt = tensorrt_llm_internlm.num_heads // num_ckpts + assert (nH_per_ckpt % ranks_per_ckpt) == 0 + ckpt = torch.load(Path(meta_ckpt_dir, + f"consolidated.{ckpt_fid:02d}.pth"), + map_location="cpu") + return split_ckpt(ckpt, ranks_per_ckpt, ckpt_rank) + + # num_ckpts == tensor_parallel, 1:1 mapping from files to TP + return torch.load(Path(meta_ckpt_dir, + f"consolidated.{mapping.tp_rank:02d}.pth"), + map_location="cpu") + + def permute(w, nH, d, dH): + # due to MQA's wk, nH*dH != d could be true + return w.view(nH, dH // 2, 2, d).transpose(1, 2).reshape(nH * dH, d) + + if not hasattr(load_from_meta_internlm, "saved_embed"): + load_from_meta_internlm.saved_embed = None + + def gather_embedding(cur_embed, name: str, num_ckpts): + if mapping.tp_size == 1: + # even if num_ckpts > 1, get_current_weights will already have it gathered + return cur_embed + if load_from_meta_internlm.saved_embed is None: + embeds = [None] * num_ckpts + for i in range(num_ckpts): + ckpt = torch.load(Path(meta_ckpt_dir, + f"consolidated.{i:02d}.pth"), + map_location="cpu") + embeds[i] = ckpt[name] + embed = torch.cat(embeds, dim=1).to(torch_dtype) + load_from_meta_internlm.saved_embed = torch_to_numpy( + embed) # cache the embedding, not needed if no refit + return load_from_meta_internlm.saved_embed + + tensorrt_llm.logger.info( + 'Loading weights from Meta InternLM checkpoints ...') + tik = time.time() + + quant_mode = getattr(tensorrt_llm_internlm, 'quant_mode', QuantMode(0)) + if quant_mode.is_int8_weight_only(): + torch.int8 + elif quant_mode.is_int4_weight_only(): + torch.quint4x2 + 
quant_mode.is_weight_only() + num_kv_heads = tensorrt_llm_internlm.num_kv_heads + mha_mode = (num_kv_heads == tensorrt_llm_internlm.num_heads) + + ckpts = list(Path(meta_ckpt_dir).glob("consolidated.*.pth")) + num_ckpts = len(ckpts) + # internlm/internlm2 doesn't have MQA. So, simplifying loader logic by not worrying about it. + assert num_kv_heads > 1 or num_kv_heads >= num_ckpts, \ + f"We don't know how the {num_kv_heads} KV heads are distributed among {num_ckpts} checkpoints." + + head_size = tensorrt_llm_internlm.hidden_size // tensorrt_llm_internlm.num_heads + ckpt = get_current_weights(num_ckpts) + layers_range = list( + range(mapping.pp_rank * tensorrt_llm_internlm.num_layers, + (mapping.pp_rank + 1) * tensorrt_llm_internlm.num_layers, 1)) + + for l in layers_range: + prefix = f'layers.{l}.attention.' + q_weight = permute(ckpt[prefix + 'wq.weight'].clone(), + nH=(tensorrt_llm_internlm.num_heads // + mapping.tp_size), + d=tensorrt_llm_internlm.hidden_size, + dH=head_size) + if num_kv_heads < mapping.tp_size and num_ckpts >= mapping.tp_size: + assert mapping.tp_size % num_kv_heads == 0 + assert False, "Not supported yet" + k_weight = permute(ckpt[prefix + 'wk.weight'].clone(), + nH=((num_kv_heads + mapping.tp_size - 1) // + mapping.tp_size), + d=tensorrt_llm_internlm.hidden_size, + dH=head_size) + v_weight = ckpt[prefix + 'wv.weight'].clone() + + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) + ckpt[prefix + 'qkv.weight'] = qkv_weight + + for k, v in ckpt.items(): + v = torch_to_numpy(v.to(torch_dtype).detach().cpu()) + if "tok_embeddings" in k: + if not tensorrt_llm_internlm.use_parallel_embedding: + v = gather_embedding(v, k, num_ckpts) + elif tensorrt_llm_internlm.embedding_sharding_dim == 0: + # this needs a gather and then resplit along different dims + v = gather_embedding(v, k, num_ckpts) + v = split(v, mapping.tp_size, mapping.tp_rank, 0) + if mapping.is_first_pp_rank(): + tensorrt_llm_internlm.vocab_embedding.weight.value = v + elif "output" in k: + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.lm_head.weight.value = v + elif k == "norm.weight": + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.ln_f.weight.value = v + else: + # layer specific weights + layer_idx = extract_layer_idx(k) + if layer_idx is None: + continue + idx = int( + layer_idx) - mapping.pp_rank * tensorrt_llm_internlm.num_layers + if idx >= tensorrt_llm_internlm.num_layers: + continue + if 'attention_norm.weight' in k: + tensorrt_llm_internlm.layers[ + idx].input_layernorm.weight.value = v + elif 'ffn_norm.weight' in k: + tensorrt_llm_internlm.layers[ + idx].post_layernorm.weight.value = v + elif 'feed_forward.w3.weight' in k: + tensorrt_llm_internlm.layers[idx].mlp.gate.weight.value = v + elif 'feed_forward.w2.weight' in k: + tensorrt_llm_internlm.layers[idx].mlp.proj.weight.value = v + elif 'feed_forward.w1.weight' in k: + tensorrt_llm_internlm.layers[idx].mlp.fc.weight.value = v + elif 'attention.wo.weight' in k: + tensorrt_llm_internlm.layers[ + idx].attention.dense.weight.value = v + elif 'attention.qkv.weight' in k: + tensorrt_llm_internlm.layers[idx].attention.qkv.weight.value = v + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. 
Total time: {t}') + return + + +def load_from_binary(tensorrt_llm_internlm: InternLMForCausalLM, + dir_path, + mapping=Mapping(), + fp16=False, + multi_query_mode=False): + tensorrt_llm.logger.info('Loading weights from FT...') + tik = time.time() + + quant_mode = getattr(tensorrt_llm_internlm, 'quant_mode', QuantMode(0)) + + n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head, attn_bias = parse_ft_config( + Path(dir_path) / 'config.ini') + np_dtype = np.float16 if fp16 else np.float32 + + def fromfile(dir_path, name, shape=None, dtype=None): + dtype = np_dtype if dtype is None else dtype + p = dir_path + '/' + name + print(f"Loading from {str(p)}") + if Path(p).exists(): + t = np.fromfile(p, dtype=dtype) + if shape is not None: + t = t.reshape(shape) + return t + return None + + def set_smoothquant_scale_factors(module, + pre_scale_weight, + dir_path, + basename, + shape, + per_tok_dyn, + per_channel, + is_qkv=False, + rank=None): + suffix = "bin" + if per_channel: + if rank is not None: + suffix = f"{rank}." + suffix + suffix = "col." + suffix + + col_shape = shape if (per_channel or is_qkv) else [1, 1] + + if per_tok_dyn: + if pre_scale_weight is not None: + pre_scale_weight.value = np.array([1.0], dtype=np.float32) + if is_qkv and not per_channel: + t = fromfile(dir_path, + f"{basename}scale_w_quant_orig.{rank}.{suffix}", + col_shape, np.float32) + else: + t = fromfile(dir_path, f"{basename}scale_w_quant_orig.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + else: + t = fromfile(dir_path, f"{basename}scale_x_orig_quant.bin", [1], + np.float32) + pre_scale_weight.value = t + if is_qkv: + t = fromfile(dir_path, + f"{basename}scale_y_accum_quant.{rank}.{suffix}", + col_shape, np.float32) + else: + t = fromfile(dir_path, + f"{basename}scale_y_accum_quant.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + t = fromfile(dir_path, f"{basename}scale_y_quant_orig.bin", [1, 1], + np.float32) + module.act_scale.value = t + + def set_smoother(module, dir_path, base_name, shape, rank): + suffix = f"{rank}.bin" + t = fromfile(dir_path, f"{base_name}.smoother.{suffix}", shape, + np.float32) + module.smoother.value = t + + # Determine the quantization mode. + quant_mode = getattr(tensorrt_llm_internlm, "quant_mode", QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + # Do we use SmoothQuant? + use_smooth_quant = quant_mode.has_act_and_weight_quant() + # Do we use quantization per token? + quant_per_token_dyn = quant_mode.has_per_token_dynamic_scaling() + # Do we use quantization per channel? + quant_per_channel = quant_mode.has_per_channel_scaling() + + # Do we use INT4/INT8 weight-only? + use_weight_only = quant_mode.is_weight_only() + + # Int8 KV cache + use_int8_kv_cache = quant_mode.has_int8_kv_cache() + + # Debug + suffix = gen_suffix(mapping.tp_rank, use_smooth_quant, quant_per_channel) + # The type of weights. 
+ w_type = np_dtype if not use_smooth_quant else np.int8 + + if mapping.is_first_pp_rank(): + tensorrt_llm_internlm.vocab_embedding.weight.value = (fromfile( + dir_path, 'vocab_embedding.weight.bin', [vocab_size, n_embd])) + + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.ln_f.weight.value = (fromfile( + dir_path, 'ln_f.weight.bin')) + # share input embedding + lm_head_weight = fromfile(dir_path, 'lm_head.weight.bin', + [vocab_size, n_embd]) + + if vocab_size % mapping.tp_size != 0: + # padding + vocab_size_padded = tensorrt_llm_internlm.lm_head.out_features * mapping.tp_size + pad_width = vocab_size_padded - vocab_size + lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), + 'constant', + constant_values=0) + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.lm_head.weight.value = np.ascontiguousarray( + split(lm_head_weight, mapping.tp_size, mapping.tp_rank)) + + layers_range = list( + range(mapping.pp_rank * tensorrt_llm_internlm.num_layers, + (mapping.pp_rank + 1) * tensorrt_llm_internlm.num_layers, 1)) + + for i in layers_range: + n_groups = n_head // n_kv_head + c_attn_out_dim = ( + 3 * n_embd // mapping.tp_size) if not multi_query_mode else ( + n_embd // mapping.tp_size + + (n_embd // n_head * n_groups) // mapping.tp_size * 2) + idx = i - mapping.pp_rank * tensorrt_llm_internlm.num_layers + tensorrt_llm_internlm.layers[idx].input_layernorm.weight.value = ( + fromfile(dir_path, + 'model.layers.' + str(i) + '.input_layernorm.weight.bin')) + + t = fromfile( + dir_path, 'model.layers.' + str(i) + + '.attention.query_key_value.weight.' + suffix, + [n_embd, c_attn_out_dim], w_type) + if t is not None: + dst = tensorrt_llm_internlm.layers[idx].attention.qkv.weight + if use_smooth_quant: + dst.value = (np.ascontiguousarray(np.transpose(t, [1, 0]))) + set_smoothquant_scale_factors( + tensorrt_llm_internlm.layers[idx].attention.qkv, + tensorrt_llm_internlm.layers[idx].input_layernorm. + scale_to_int, + dir_path, + 'model.layers.' + str(i) + '.attention.query_key_value.', + [1, c_attn_out_dim], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank, + is_qkv=True) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + # workaround for trt not supporting int8 inputs in plugins currently + dst.value = processed_torch_weights.view( + dtype=torch.float32).numpy() + scales = tensorrt_llm_internlm.layers[ + i].attention.qkv.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + dst = tensorrt_llm_internlm.layers[idx].attention.dense.weight + t = fromfile( + dir_path, + 'model.layers.' + str(i) + '.attention.dense.weight.' + suffix, + [n_embd // mapping.tp_size, n_embd], w_type) + if use_smooth_quant: + dst.value = (np.ascontiguousarray(np.transpose(t, [1, 0]))) + dense_scale = getattr(tensorrt_llm_internlm.layers[idx].attention, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_internlm.layers[idx].attention.dense, dense_scale, + dir_path, 'model.layers.' + str(i) + '.attention.dense.', + [1, n_embd], quant_per_token_dyn, quant_per_channel) + set_smoother(tensorrt_llm_internlm.layers[idx].attention.dense, + dir_path, + 'model.layers.' 
+ str(i) + '.attention.dense', + [1, n_embd // mapping.tp_size], mapping.tp_rank) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + i].attention.dense.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + if attn_bias: + dst = tensorrt_llm_internlm.layers[idx].attention.qkv.bias + t = fromfile( + dir_path, 'model.layers.' + str(i) + + f'.attention.query_key_value.bias.{mapping.tp_rank}.bin') + dst.value = np.ascontiguousarray(t) + + dst = tensorrt_llm_internlm.layers[idx].attention.dense.bias + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.attention.dense.bias.bin') + dst.value = np.ascontiguousarray(t) + + dst = tensorrt_llm_internlm.layers[idx].post_layernorm.weight + dst.value = fromfile( + dir_path, 'model.layers.' + str(i) + '.post_layernorm.weight.bin') + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.fc.weight.' + suffix, + [n_embd, inter_size // mapping.tp_size], w_type) + + if use_smooth_quant: + tensorrt_llm_internlm.layers[idx].mlp.fc.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + set_smoothquant_scale_factors( + tensorrt_llm_internlm.layers[idx].mlp.fc, + tensorrt_llm_internlm.layers[idx].post_layernorm.scale_to_int, + dir_path, + 'model.layers.' + str(i) + '.mlp.fc.', + [1, inter_size // mapping.tp_size], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_internlm.layers[i].mlp.fc.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[i].mlp.fc.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_internlm.layers[ + idx].mlp.fc.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.gate.weight.' + suffix, + [n_embd, inter_size // mapping.tp_size], w_type) + if use_smooth_quant: + tensorrt_llm_internlm.layers[idx].mlp.gate.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + set_smoothquant_scale_factors( + tensorrt_llm_internlm.layers[idx].mlp.gate, + tensorrt_llm_internlm.layers[idx].post_layernorm.scale_to_int, + dir_path, + 'model.layers.' + str(i) + '.mlp.gate.', + [1, inter_size // mapping.tp_size], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_internlm.layers[i].mlp.gate.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[i].mlp.gate.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_internlm.layers[ + idx].mlp.gate.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.proj.weight.' 
+ suffix, + [inter_size // mapping.tp_size, n_embd], w_type) + if use_smooth_quant: + tensorrt_llm_internlm.layers[idx].mlp.proj.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + proj_scale = getattr(tensorrt_llm_internlm.layers[idx].mlp, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_internlm.layers[idx].mlp.proj, proj_scale, + dir_path, 'model.layers.' + str(i) + '.mlp.proj.', [1, n_embd], + quant_per_token_dyn, quant_per_channel) + set_smoother(tensorrt_llm_internlm.layers[idx].mlp.proj, dir_path, + 'model.layers.' + str(i) + '.mlp.proj', + [1, inter_size // mapping.tp_size], mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_internlm.layers[i].mlp.proj.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[i].mlp.proj.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_internlm.layers[idx].mlp.proj.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + + if use_int8_kv_cache: + t = fromfile( + dir_path, 'model.layers.' + str(i) + + '.attention.query_key_value.scale_y_quant_orig.bin', [1], + np.float32) + tensorrt_llm_internlm.layers[ + idx].attention.kv_orig_quant_scale.value = 1.0 / t + tensorrt_llm_internlm.layers[ + idx].attention.kv_quant_orig_scale.value = t + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. Total time: {t}') + + +def load_from_gptq_internlm(tensorrt_llm_internlm, + quant_ckpt_path, + mapping=Mapping(), + dtype="float16"): + tensorrt_llm.logger.info( + 'Loading weights from groupwise GPTQ InternLM safetensors...') + tik = time.time() + + if quant_ckpt_path.endswith(".safetensors"): + groupwise_qweight_safetensors = safe_open(quant_ckpt_path, + framework="pt", + device=0) + model_params = { + key: groupwise_qweight_safetensors.get_tensor(key) + for key in groupwise_qweight_safetensors.keys() + } + elif quant_ckpt_path.endswith(".pt"): + model_params = torch.load(quant_ckpt_path, + map_location=torch.device('cpu')) + else: + assert False, "Quantized checkpoint format not supported!" 
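The GPTQ loader that continues below relies on a specific packing convention: each int32 element of `qweight`/`qzeros` holds eight unsigned 4-bit values, low nibble first, which are later recentered from the unsigned [0, 15] range to the signed [-8, 7] range expected by the TensorRT-LLM plugin. A minimal sketch of that convention (values and shapes are illustrative only), mirroring `unpack_int32_into_int8()` and the `- 8` shift in `preprocess_groupwise_weight_params()`:

```python
import torch

# One byte packing two uint4 values: 3 in the low nibble, 12 in the high nibble.
packed = torch.tensor([[3 + 12 * 16]], dtype=torch.uint8)

# Even output columns take the low nibble, odd columns the high nibble.
unpacked = torch.zeros(packed.shape[0], packed.shape[1] * 2, dtype=torch.int8)
unpacked[:, 0::2] = packed % 16   # -> 3
unpacked[:, 1::2] = packed // 16  # -> 12

# GPTQ stores unsigned values; subtracting 8 recenters them to signed int4.
print((unpacked - 8).tolist())    # [[-5, 4]]
```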
+ + def unpack_int32_into_int8(w_packed): + # Unpack inputs packed in int32/float32 into uint4 and store them in int8 format + w_packed_int4x2 = w_packed.contiguous().view(torch.uint8) + w_unpacked = torch.zeros(w_packed_int4x2.shape[0], + w_packed_int4x2.shape[1] * 2, + dtype=torch.int8) + w_unpacked[:, ::2] = w_packed_int4x2 % 16 + w_unpacked[:, 1::2] = w_packed_int4x2 // 16 + return w_unpacked.contiguous() + + def preprocess_groupwise_weight_params(weight_name, + qweight_int32=None, + qzeros_int32=None, + scales_fp16=None): + if weight_name is not None: + qweight_int32 = model_params[weight_name].cpu() + qzeros_int32 = model_params[weight_name[:-7] + 'qzeros'].cpu() + scales_fp16 = model_params[weight_name[:-7] + 'scales'].cpu() + + UINT4_TO_INT4_FLAG = 1 + GPTQ_FLAG = 1 + packer = torch.ops.fastertransformer.pack_int8_tensor_to_packed_int4 + preprocessor = torch.ops.fastertransformer.preprocess_weights_for_mixed_gemm + + qweight_unpacked_int8 = unpack_int32_into_int8( + qweight_int32.T).T.contiguous() - 8 + qweight_interleaved = preprocessor(packer(qweight_unpacked_int8), + torch.quint4x2).view(torch.float32) + # zeros = zeros * scales + qzeros_unpacked_int32 = unpack_int32_into_int8(qzeros_int32) + zeros_x_scales_fp16 = (-qzeros_unpacked_int32 + 8 * UINT4_TO_INT4_FLAG - + GPTQ_FLAG) * scales_fp16 + zeros_x_scales_fp16 = zeros_x_scales_fp16.half() + + # return processed interleaved weight, original scales and zeros * scales + return qweight_interleaved.contiguous(), scales_fp16.contiguous( + ), zeros_x_scales_fp16.contiguous() + + layer_ids = [ + extract_layer_idx(key) for key in groupwise_qweight_safetensors.keys() + ] + layer_ids = [ + int(layer_idx) for layer_idx in layer_ids if layer_idx is not None + ] + num_hidden_layers = max(layer_ids) + 1 + num_kv_heads = tensorrt_llm_internlm.num_kv_heads + mha_mode = (num_kv_heads == tensorrt_llm_internlm.num_heads) + suffixs = ['qweight', 'qzeros', 'scales'] + + layers_per_pipeline_stage = num_hidden_layers // mapping.pp_size + layers_range = list( + range(mapping.pp_rank * layers_per_pipeline_stage, + (mapping.pp_rank + 1) * layers_per_pipeline_stage, 1)) + + for l in layers_range: + prefix = f'model.layers.{l}.self_attn.' + split_qkv_suf = [] + + for suf in suffixs: + q_part = model_params[prefix + 'q_proj.' + suf].cpu() + k_part = model_params[prefix + 'k_proj.' + suf].cpu() + v_part = model_params[prefix + 'v_proj.' 
+ suf].cpu() + qkv_part = torch.cat([q_part, k_part, v_part], dim=0) + dim = qkv_part.shape + qkv_part = qkv_part.reshape(3, dim[0] // 3, dim[1]) + split_qkv = qkv_part.split(dim[1] // mapping.tp_size, + dim=2)[mapping.tp_rank] + split_qkv = torch.cat([ + split_qkv[0, :, :].squeeze(0), split_qkv[1, :, :].squeeze(0), + split_qkv[2, :, :].squeeze(0) + ], + dim=1) + split_qkv_suf.append(split_qkv) + + th_qweight, th_zero, th_scale = preprocess_groupwise_weight_params( + None, split_qkv_suf[0], split_qkv_suf[1], split_qkv_suf[2]) + + idx = l - mapping.pp_rank * layers_per_pipeline_stage + tensorrt_llm_internlm.layers[ + idx].attention.qkv.qweight.value = th_qweight.numpy() + tensorrt_llm_internlm.layers[ + idx].attention.qkv.scale.value = th_zero.numpy() + tensorrt_llm_internlm.layers[ + idx].attention.qkv.zero.value = th_scale.numpy() + + torch_dtype = str_dtype_to_torch(dtype) + + for k, v in model_params.items(): + if isinstance(v, list): + v = [torch_to_numpy(vv.to(torch_dtype).detach().cpu()) for vv in v] + else: + v = torch_to_numpy(v.to(torch_dtype).detach().cpu()) + if 'model.embed_tokens.weight' in k: + if mapping.is_first_pp_rank(): + tensorrt_llm_internlm.vocab_embedding.weight.value = v + elif 'model.norm.weight' in k: + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.ln_f.weight.value = v + elif 'lm_head.weight' in k: + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.lm_head.weight.value = np.ascontiguousarray( + split(v, mapping.tp_size, mapping.tp_rank)) + else: + layer_idx = extract_layer_idx(k) + if layer_idx is None: + continue + idx = int(layer_idx) + if idx not in layers_range: + continue + idx = idx - mapping.pp_rank * layers_per_pipeline_stage + + if 'input_layernorm.weight' in k: + tensorrt_llm_internlm.layers[ + idx].input_layernorm.weight.value = v + elif 'post_attention_layernorm.weight' in k: + tensorrt_llm_internlm.layers[ + idx].post_layernorm.weight.value = v + elif 'self_attn.o_proj.qweight' in k: + split_v_suf = [] + for suf in suffixs: + v = model_params[k[:-7] + suf].cpu() + split_v = v.split(v.shape[0] // mapping.tp_size, + dim=0)[mapping.tp_rank] + split_v_suf.append(split_v) + th_qweight, th_zero, th_scale = preprocess_groupwise_weight_params( + None, split_v_suf[0], split_v_suf[1], split_v_suf[2]) + tensorrt_llm_internlm.layers[ + idx].attention.dense.qweight.value = th_qweight.numpy() + tensorrt_llm_internlm.layers[ + idx].attention.dense.scale.value = th_zero.numpy() + tensorrt_llm_internlm.layers[ + idx].attention.dense.zero.value = th_scale.numpy() + elif 'mlp.up_proj.qweight' in k: + split_v_suf = [] + for suf in suffixs: + v = model_params[k[:-7] + suf].cpu() + split_v = v.split(v.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + split_v_suf.append(split_v) + th_qweight, th_zero, th_scale = preprocess_groupwise_weight_params( + None, split_v_suf[0], split_v_suf[1], split_v_suf[2]) + tensorrt_llm_internlm.layers[ + idx].mlp.gate.qweight.value = th_qweight.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.gate.scale.value = th_zero.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.gate.zero.value = th_scale.numpy() + elif 'mlp.down_proj.qweight' in k: + split_v_suf = [] + for suf in suffixs: + v = model_params[k[:-7] + suf].cpu() + split_v = v.split(v.shape[0] // mapping.tp_size, + dim=0)[mapping.tp_rank] + split_v_suf.append(split_v) + th_qweight, th_zero, th_scale = preprocess_groupwise_weight_params( + None, split_v_suf[0], split_v_suf[1], split_v_suf[2]) + tensorrt_llm_internlm.layers[ + idx].mlp.proj.qweight.value = 
th_qweight.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.proj.scale.value = th_zero.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.proj.zero.value = th_scale.numpy() + elif 'mlp.gate_proj.qweight' in k: + split_v_suf = [] + for suf in suffixs: + v = model_params[k[:-7] + suf].cpu() + split_v = v.split(v.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + split_v_suf.append(split_v) + th_qweight, th_zero, th_scale = preprocess_groupwise_weight_params( + None, split_v_suf[0], split_v_suf[1], split_v_suf[2]) + tensorrt_llm_internlm.layers[ + idx].mlp.fc.qweight.value = th_qweight.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.fc.scale.value = th_zero.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.fc.zero.value = th_scale.numpy() + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. Total time: {t}') + return + + +def load_from_awq_internlm(tensorrt_llm_internlm: InternLMForCausalLM, + quant_ckpt_path, + mapping=Mapping(), + dtype="float16"): + tensorrt_llm.logger.info( + 'Loading weights from groupwise AWQ InternLM safetensors...') + tik = time.time() + + if quant_ckpt_path.endswith(".safetensors"): + groupwise_qweight_safetensors = safe_open(quant_ckpt_path, + framework="pt", + device=0) + awq_internlm = { + key: groupwise_qweight_safetensors.get_tensor(key) + for key in groupwise_qweight_safetensors.keys() + } + elif quant_ckpt_path.endswith(".pt"): + awq_internlm = torch.load(quant_ckpt_path, + map_location=torch.device('cpu')) + else: + assert False, "Quantized checkpoint format not supported!" + + group_size = awq_internlm["model.layers.0.self_attn.o_proj.weight"].numel( + ) // awq_internlm[ + "model.layers.0.self_attn.o_proj.weight_quantizer._amax"].numel() + + awq_internlm_block_names = [ + "input_layernorm.weight", + "post_attention_layernorm.weight", + ] + + tensorrt_llm_internlm_block_names = [ + "input_layernorm.weight", + "post_layernorm.weight", + ] + + getattr(tensorrt_llm_internlm, 'quant_mode', QuantMode(0)) + + packer = torch.ops.fastertransformer.pack_int8_tensor_to_packed_int4 + preprocessor = torch.ops.fastertransformer.preprocess_weights_for_mixed_gemm + torch_dtype = str_dtype_to_torch(dtype) + + def AWQ_quantize_pack_preprocess(weight, scale): + scale = scale.repeat_interleave(group_size, dim=0) + weight = weight / scale + qweight_int8 = torch.clamp(torch.round(weight.cuda()).char(), -8, 7) + int4_weight = packer(qweight_int8.cpu()) + int4_weight = preprocessor(int4_weight, torch.quint4x2) + return int4_weight.view(torch.float32).cpu().numpy() + + def process_and_assign_weight(awq_internlm, mPrefix, mOp, tp_dim=0): + weight = awq_internlm[mPrefix + ".weight"].T.contiguous() + [k, n] = weight.shape + weight = weight.split(weight.shape[tp_dim] // mapping.tp_size, + dim=tp_dim)[mapping.tp_rank] + amax = awq_internlm[mPrefix + ".weight_quantizer._amax"].reshape( + (n, int(k / group_size))).T.contiguous() + amax = amax.split(amax.shape[tp_dim] // mapping.tp_size, + dim=tp_dim)[mapping.tp_rank] + pre_quant_scale = awq_internlm[ + mPrefix + ".input_quantizer._pre_quant_scale"].reshape((1, k)) + if tp_dim == 0: + pre_quant_scale = pre_quant_scale.split(k // mapping.tp_size, + dim=1)[mapping.tp_rank] + scale = amax / 8.0 + mOp.qweight.value = AWQ_quantize_pack_preprocess(weight, scale) + mOp.scale.value = scale.to(torch_dtype).cpu().numpy() + mOp.pre_quant_scale.value = pre_quant_scale.to( + torch_dtype).cpu().numpy() + + def deSmooth(weight, pre_quant_scale): + [k, n] = weight.shape + 
pre_quant_scale = pre_quant_scale.repeat( + (n, 1)).transpose(1, 0).contiguous() + weight = weight * pre_quant_scale + return weight + + def reSmooth(weight, pre_quant_scale): + [k, n] = weight.shape + pre_quant_scale = pre_quant_scale.repeat( + (n, 1)).transpose(1, 0).contiguous() + weight = weight / pre_quant_scale + return weight + + def get_scale(weight): + weight = weight.T.contiguous() + [n, k] = weight.shape + weight = weight.reshape(n, int(k / group_size), group_size) + weight = torch.abs(weight.reshape(-1, group_size)) + amax, idx = weight.max(1) + amax = amax.reshape(n, int(k / group_size)).T.contiguous() + return amax / 8 + + def reSmooth_and_get_scale(weight, pre_quant_scale, avg_pre_quant_scale): + weight = deSmooth(weight, pre_quant_scale) + weight = reSmooth(weight, avg_pre_quant_scale) + scale = get_scale(weight) + return weight, scale + + def process_and_assign_qkv_weight(awq_internlm, prefix, mOp): + q_weight = awq_internlm[prefix + + "self_attn.q_proj.weight"].T.contiguous() + k_weight = awq_internlm[prefix + + "self_attn.k_proj.weight"].T.contiguous() + v_weight = awq_internlm[prefix + + "self_attn.v_proj.weight"].T.contiguous() + k = q_weight.shape[0] + + q_weight = q_weight.split(q_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + k_weight = k_weight.split(k_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + v_weight = v_weight.split(v_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + + q_pre_quant_scale = awq_internlm[ + prefix + + "self_attn.q_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) + k_pre_quant_scale = awq_internlm[ + prefix + + "self_attn.k_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) + v_pre_quant_scale = awq_internlm[ + prefix + + "self_attn.v_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) + + qkv_pre_quant_scale = (q_pre_quant_scale + k_pre_quant_scale + + v_pre_quant_scale) / 3.0 + q_weight, q_scale = reSmooth_and_get_scale(q_weight, q_pre_quant_scale, + qkv_pre_quant_scale) + k_weight, k_scale = reSmooth_and_get_scale(k_weight, k_pre_quant_scale, + qkv_pre_quant_scale) + v_weight, v_scale = reSmooth_and_get_scale(v_weight, v_pre_quant_scale, + qkv_pre_quant_scale) + + qkv_weights = torch.cat((q_weight, k_weight, v_weight), dim=1) + qkv_scale = torch.cat((q_scale, k_scale, v_scale), dim=1) + + mOp.pre_quant_scale.value = qkv_pre_quant_scale.to( + torch_dtype).cpu().numpy() + mOp.qweight.value = AWQ_quantize_pack_preprocess(qkv_weights, qkv_scale) + mOp.scale.value = qkv_scale.to(torch_dtype).cpu().numpy() + + # Check if we need to pad vocab + v = awq_internlm.get('model.embed_tokens.weight') + [vocab_size, k] = v.shape + pad_vocab = False + pad_vocab_size = vocab_size + if vocab_size % 64 != 0: + pad_vocab = True + pad_vocab_size = int((vocab_size + 63) / 64) * 64 + if pad_vocab: + new_v = torch.zeros([pad_vocab_size, k]) + new_v[:vocab_size, :] = v + v = new_v + if mapping.is_first_pp_rank(): + tensorrt_llm_internlm.vocab_embedding.weight.value = v.to( + torch_dtype).cpu().numpy() + + layer_ids = [extract_layer_idx(key) for key in awq_internlm.keys()] + layer_ids = [ + int(layer_idx) for layer_idx in layer_ids if layer_idx is not None + ] + + num_hidden_layers = max(layer_ids) + 1 + layers_per_pipeline_stage = num_hidden_layers // mapping.pp_size + layers_range = list( + range(mapping.pp_rank * layers_per_pipeline_stage, + (mapping.pp_rank + 1) * layers_per_pipeline_stage, 1)) + + for layer_idx in layers_range: + prefix = "model.layers." + str(layer_idx) + "." 
+ tensorrt_llm.logger.info(f'Process weights in layer: {layer_idx}') + for idx, awq_attr in enumerate(awq_internlm_block_names): + v = awq_internlm[prefix + awq_attr] + layer = attrgetter(tensorrt_llm_internlm_block_names[idx])( + tensorrt_llm_internlm.layers[layer_idx]) + setattr(layer, 'value', v.to(torch_dtype).cpu().numpy()) + + # Attention QKV Linear + # concatenate the Q, K, V layers weights. + process_and_assign_qkv_weight( + awq_internlm, prefix, + tensorrt_llm_internlm.layers[layer_idx].attention.qkv) + + # Attention Dense (out_proj) Linear + mPrefix = prefix + "self_attn.o_proj" + mOp = tensorrt_llm_internlm.layers[layer_idx].attention.dense + process_and_assign_weight(awq_internlm, mPrefix, mOp, 0) + + # MLP up_proj (mlp.gate) Linear + mPrefix = prefix + "mlp.up_proj" + mOp = tensorrt_llm_internlm.layers[layer_idx].mlp.gate + process_and_assign_weight(awq_internlm, mPrefix, mOp, 1) + + # MLP down_proj (mlp.proj) Linear + mPrefix = prefix + "mlp.down_proj" + mOp = tensorrt_llm_internlm.layers[layer_idx].mlp.proj + process_and_assign_weight(awq_internlm, mPrefix, mOp, 0) + + # MLP gate_proj (mlp.fc) Linear + mPrefix = prefix + "mlp.gate_proj" + mOp = tensorrt_llm_internlm.layers[layer_idx].mlp.fc + process_and_assign_weight(awq_internlm, mPrefix, mOp, 1) + + v = awq_internlm['model.norm.weight'] + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.ln_f.weight.value = v.to( + torch_dtype).cpu().numpy() + + #lm_head + if pad_vocab: + weight = awq_internlm['lm_head.weight'] + [vocab_size, k] = weight.shape + new_weight = torch.zeros([pad_vocab_size, k]) + new_weight[:vocab_size, :] = weight + new_weight = new_weight.T.contiguous() + amax = awq_internlm['lm_head.weight_quantizer._amax'].reshape( + [vocab_size, k // group_size]) + new_amax = torch.ones([pad_vocab_size, k // group_size]) + new_amax[:vocab_size, :] = amax + new_amax = new_amax.T.contiguous() + new_scale = new_amax / 8 + tensorrt_llm_internlm.lm_head.qweight.value = AWQ_quantize_pack_preprocess( + new_weight, new_scale) + tensorrt_llm_internlm.lm_head.scale.value = new_scale.to( + torch_dtype).cpu().numpy() + tensorrt_llm_internlm.lm_head.pre_quant_scale.value = awq_internlm[ + 'lm_head.input_quantizer._pre_quant_scale'].to( + torch_dtype).cpu().numpy() + else: + mPrefix = "lm_head" + mOp = tensorrt_llm_internlm.lm_head + if mapping.is_last_pp_rank(): + process_and_assign_weight(awq_internlm, mPrefix, mOp, 1) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. Total time: {t}') diff --git a/examples/llama/README.md b/examples/llama/README.md index b147e68875..d7df10f86e 100644 --- a/examples/llama/README.md +++ b/examples/llama/README.md @@ -14,7 +14,10 @@ The TensorRT-LLM LLaMA implementation can be found in [tensorrt_llm/models/llama * FP16 * FP8 * INT8 & INT4 Weight-Only + * SmoothQuant + * Groupwise quantization (AWQ/GPTQ) * FP8 KV CACHE + * INT8 KV CACHE (+ AWQ/per-channel weight-only) * Tensor Parallel * STRONGLY TYPED @@ -152,8 +155,18 @@ python build.py --meta_ckpt_dir ./tmp/llama/70B \ Same instructions can be applied to fine-tuned versions of the LLaMA v2 models (e.g. 7Bf or llama-2-7b-chat). -#### INT8 weight only + INT8 KV cache -For INT8 KV cache, [`hf_llama_convert.py`](./hf_llama_convert.py) features a +### Using RoPE Scaling +RoPE scaling is supported through GPT Attention Plugin. You can add `--rotary_scaling ` during the build command to enable it. +- The value of `type` can be either `linear` and `dynamic`. 
+- The value of `factor` can be any value larger than `1.0`.
+
+The implementation is identical to Huggingface's.
+Please refer to https://huggingface.co/docs/transformers/model_doc/llama2#transformers.LlamaConfig.rope_scaling for more details.
+
+#### INT8 KV cache
+INT8 KV cache can be enabled to reduce the memory footprint. It brings larger performance gains as the batch size grows.
+
+You can get the INT8 scale of KV cache through [`hf_llama_convert.py`](./hf_llama_convert.py), which features a
 `--calibrate-kv-cache, -kv` option. Setting `-kv` will calibrate the model,
 and then export the scaling factors needed for INT8 KV cache inference.

@@ -166,9 +179,11 @@ python3 hf_llama_convert.py -i /llama-models/llama-7b-hf -o /llama/smooth_llama_

 [`build.py`](./build.py) add new options for the support of INT8 KV cache.

-`--int8_kv_cache` is the command-line option to enable INT8 KV cache.
+`--int8_kv_cache` is the command-line option to enable INT8 KV cache, and `--ft_model_dir` should point to the directory that contains the INT8 KV cache scales.

-In addition, it could be combined with INT8 weight-only quantization, as follows:
+**INT8 KV cache + per-channel weight-only quantization**
+
+INT8 KV cache can be combined with per-channel weight-only quantization, as follows:

 Examples of INT8 weight-only quantization + INT8 KV cache

@@ -193,6 +208,38 @@ python summarize.py --test_trt_llm \
     --test_hf
 ```

+**INT8 KV cache + AWQ**
+
+In addition, you can enable INT8 KV cache together with AWQ (per-group INT4 weight-only quantization) as in the following command.
+
+**NOTE**: The AWQ checkpoint is passed through `--model_dir`, and the INT8 KV cache scales are passed through `--ft_model_dir`.
+
+```bash
+# --int8_kv_cache turns on INT8 KV cache.
+# --ft_model_dir is the directory to look for the INT8 scale of KV cache.
+python build.py --model_dir ./tmp/llama/7B/ \
+                --quant_ckpt_path ./llama-7b-4bit-gs128-awq.pt \
+                --dtype float16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin float16 \
+                --enable_context_fmha \
+                --use_gemm_plugin float16 \
+                --use_weight_only \
+                --weight_only_precision int4_awq \
+                --per_group \
+                --output_dir ./tmp/llama/7B/trt_engines/int8_kv_cache_int4_AWQ/1-gpu/ \
+                --int8_kv_cache \
+                --ft_model_dir /llama/smooth_llama_7B/int8_kv_cache/1-gpu/
+```
+
+Test with `summarize.py`:
+
+```bash
+python summarize.py --test_trt_llm \
+                    --hf_model_location /llama-models/llama-7b-hf \
+                    --data_type fp16 \
+                    --engine_dir ./tmp/llama/7B/trt_engines/int8_kv_cache_int4_AWQ/1-gpu \
+                    --test_hf
+```
+
 #### SmoothQuant
 The smoothquant supports both LLaMA v1 and LLaMA v2. Unlike the FP16 build where the HF weights are processed and loaded into the TensorRT-LLM directly, the SmoothQuant needs to load INT8 weights which should be pre-processed before building an engine.
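About the INT8 KV cache scales referenced above: calibration exports one per-tensor `scale_y_quant_orig` value per attention layer, and the loaders keep both that value and its reciprocal (for example, `load_from_binary` in the InternLM `weight.py` above sets `kv_orig_quant_scale = 1.0 / t` and `kv_quant_orig_scale = t`). A conceptual sketch, with made-up numbers, of how such a pair quantizes values written to the cache and restores them on read:

```python
import numpy as np

# Per-tensor scale exported by calibration (roughly amax / 127 for int8).
scale_y_quant_orig = np.float32(0.05)           # quant -> original domain
kv_orig_quant_scale = 1.0 / scale_y_quant_orig  # original -> quant domain

kv = np.array([-1.7, 0.0, 2.3], dtype=np.float32)

# Quantize on write to the KV cache, dequantize on read.
kv_int8 = np.clip(np.rint(kv * kv_orig_quant_scale), -128, 127).astype(np.int8)
kv_restored = kv_int8.astype(np.float32) * scale_y_quant_orig

print(kv_int8)      # [-34   0  46]
print(kv_restored)  # approximately the original values
```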
diff --git a/examples/llama/build.py b/examples/llama/build.py index 7351cf541f..5a4b524f52 100644 --- a/examples/llama/build.py +++ b/examples/llama/build.py @@ -32,9 +32,7 @@ from tensorrt_llm.builder import Builder from tensorrt_llm.layers.attention import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (fp8_quantize, smooth_quantize, - weight_only_groupwise_quantize, - weight_only_quantize) +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -340,6 +338,11 @@ def parse_arguments(): action='store_true', help= 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') + parser.add_argument( + '--max_prompt_embedding_table_size', + type=int, + default=0, + help='Setting to a value > 0 enables support for prompt tuning.') args = parser.parse_args() tensorrt_llm.logger.set_level(args.log_level) @@ -372,17 +375,13 @@ def parse_arguments(): args.quant_mode = QuantMode.use_smooth_quant(args.per_token, args.per_channel) elif args.use_weight_only: - if args.per_group: - args.quant_mode = QuantMode.from_description( - quantize_weights=True, - quantize_activations=False, - per_token=False, - per_channel=False, - per_group=True, - use_int4_weights=True) - else: - args.quant_mode = QuantMode.use_weight_only( - args.weight_only_precision == 'int4') + args.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=args.per_group, + use_int4_weights=args.weight_only_precision == "int4") else: args.quant_mode = QuantMode(0) @@ -394,6 +393,7 @@ def parse_arguments(): args.quant_mode = args.quant_mode.set_fp8_qdq() if args.rotary_scaling is not None: + assert args.use_gpt_attention_plugin, "RoPE scaling is only supported through GPT attention plugin." rotary_scaling = { "type": args.rotary_scaling[0], "factor": float(args.rotary_scaling[1]) @@ -401,8 +401,6 @@ def parse_arguments(): assert rotary_scaling["type"] in ["linear", "dynamic"] assert rotary_scaling["factor"] > 1.0 args.rotary_scaling = rotary_scaling - if rotary_scaling["type"] == "dynamic": - assert not args.remove_input_padding, "TODO: Not supported yet" if args.model_dir is not None: hf_config = LlamaConfig.from_pretrained(args.model_dir) @@ -452,9 +450,6 @@ def parse_arguments(): "MQA/GQA requires either the number of K/V heads to be divisible by the tensor parallelism size OR " \ "the tensor parallelism size to be divisible by the number of K/V heads." 
- if args.dtype == 'bfloat16': - assert args.use_gemm_plugin, "Please use gemm plugin when dtype is bfloat16" - assert args.pp_size * args.tp_size == args.world_size if args.max_num_tokens is not None: @@ -509,47 +504,40 @@ def build_rank_engine(builder: Builder, embedding_sharding_dim=args.embedding_sharding_dim, quant_mode=args.quant_mode, rms_norm_eps=args.rms_norm_eps, - use_fused_mlp=args.use_fused_mlp) - if args.use_smooth_quant: - tensorrt_llm_llama = smooth_quantize(tensorrt_llm_llama, - args.quant_mode) - elif args.use_weight_only: - if args.weight_only_precision == 'int8': - tensorrt_llm_llama = weight_only_quantize(tensorrt_llm_llama, - args.quant_mode) - elif args.weight_only_precision == 'int4': - tensorrt_llm_llama = weight_only_quantize(tensorrt_llm_llama, - args.quant_mode) - elif args.weight_only_precision == 'int4_awq': - tensorrt_llm_llama = weight_only_groupwise_quantize( - model=tensorrt_llm_llama, - quant_mode=args.quant_mode, - group_size=args.group_size, - zero=False, - pre_quant_scale=True, - exclude_modules=[]) + use_fused_mlp=args.use_fused_mlp, + use_prompt_tuning=args.max_prompt_embedding_table_size > 0, + ) + quantize_kwargs = {} + if args.use_smooth_quant or args.use_weight_only: + if args.weight_only_precision == 'int4_awq': + quantize_kwargs = { + "group_size": args.group_size, + "zero": False, + "pre_quant_scale": True, + "exclude_modules": [], + } elif args.weight_only_precision == 'int4_gptq': - tensorrt_llm_llama = weight_only_groupwise_quantize( - model=tensorrt_llm_llama, - quant_mode=args.quant_mode, - group_size=args.group_size, - zero=True, - pre_quant_scale=False) + quantize_kwargs = { + "group_size": args.group_size, + "zero": True, + "pre_quant_scale": False, + } elif args.enable_fp8 or args.fp8_kv_cache: logger.info(f'Loading scaling factors from ' f'{args.quantized_fp8_model_path}') quant_scales = get_scaling_factors(args.quantized_fp8_model_path, num_layers=args.n_layer, quant_mode=args.quant_mode) - tensorrt_llm_llama = fp8_quantize(tensorrt_llm_llama, - quant_mode=args.quant_mode, - quant_scales=quant_scales) + quantize_kwargs = {"quant_scales": quant_scales} + tensorrt_llm_llama = quantize_model(tensorrt_llm_llama, args.quant_mode, + **quantize_kwargs) if args.per_group: load_func = load_from_awq_llama if args.weight_only_precision == 'int4_awq' else load_from_gptq_llama load_func(tensorrt_llm_llama=tensorrt_llm_llama, quant_ckpt_path=args.quant_ckpt_path, mapping=mapping, - dtype=args.dtype) + dtype=args.dtype, + ft_model_dir=args.ft_model_dir) elif args.meta_ckpt_dir is not None: load_from_meta_llama(tensorrt_llm_llama, args.meta_ckpt_dir, mapping, args.dtype) @@ -625,11 +613,15 @@ def build_rank_engine(builder: Builder, network.set_named_parameters(tensorrt_llm_llama.named_parameters()) # Forward - inputs = tensorrt_llm_llama.prepare_inputs(args.max_batch_size, - args.max_input_len, - args.max_output_len, True, - args.max_beam_width, - args.max_num_tokens) + inputs = tensorrt_llm_llama.prepare_inputs( + args.max_batch_size, + args.max_input_len, + args.max_output_len, + True, + args.max_beam_width, + args.max_num_tokens, + prompt_embedding_table_size=args.max_prompt_embedding_table_size, + ) tensorrt_llm_llama(*inputs) if args.enable_debug_output: # mark intermediate nodes' outputs @@ -651,6 +643,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_llama) + return engine 
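The `build.py` changes above funnel every quantization flavor through a single `quantize_model` call driven by `QuantMode` plus a method-specific `quantize_kwargs` dict. Below is a minimal sketch of that call pattern for the groupwise int4 modes, assuming a group size of 128; the helper `groupwise_quantize_kwargs` is illustrative and not part of the library.

```python
from tensorrt_llm.models import quantize_model
from tensorrt_llm.quantization import QuantMode

def groupwise_quantize_kwargs(weight_only_precision: str, group_size: int = 128) -> dict:
    """Illustrative: pick the kwargs used for groupwise weight-only modes."""
    if weight_only_precision == 'int4_awq':
        # AWQ: no zero point, but a pre-quantization activation scale.
        return {"group_size": group_size, "zero": False,
                "pre_quant_scale": True, "exclude_modules": []}
    if weight_only_precision == 'int4_gptq':
        # GPTQ: zero point, no pre-quantization scale.
        return {"group_size": group_size, "zero": True, "pre_quant_scale": False}
    return {}

# Mirrors the int4_awq path: per-group INT4 weights, no activation quantization.
quant_mode = QuantMode.from_description(quantize_weights=True,
                                        quantize_activations=False,
                                        per_token=False,
                                        per_channel=False,
                                        per_group=True,
                                        use_int4_weights=True)

# `model` would be the LLaMAForCausalLM instance constructed in build_rank_engine:
# model = quantize_model(model, quant_mode, **groupwise_quantize_kwargs('int4_awq'))
```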
@@ -690,10 +685,12 @@ def build(rank, args): max_output_len=args.max_output_len, max_num_tokens=args.max_num_tokens, int8=int8_trt_flag, - fp8=args.quant_mode.has_fp8_qdq(), quant_mode=args.quant_mode, strongly_typed=args.strongly_typed, - opt_level=args.builder_opt) + opt_level=args.builder_opt, + max_prompt_embedding_table_size=args. + max_prompt_embedding_table_size, + ) engine_name = get_engine_name(MODEL_NAME, args.dtype, args.tp_size, args.pp_size, cur_rank) engine = build_rank_engine(builder, builder_config, engine_name, @@ -706,6 +703,7 @@ def build(rank, args): cache = builder_config.trt_builder_config.get_timing_cache() serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine if rank == 0: ok = builder.save_timing_cache( diff --git a/examples/llama/hf_llama_convert.py b/examples/llama/hf_llama_convert.py index 3db3b8f350..f16627c30d 100644 --- a/examples/llama/hf_llama_convert.py +++ b/examples/llama/hf_llama_convert.py @@ -44,7 +44,6 @@ def merge_qkv_scales(q_name, hf_model, scales, llama_qkv_para): scales[layer_name_qkv]["x"] = scales[layer_name_q]["x"] scales[layer_name_qkv]["w"] = weight.abs().max(dim=1)[0] - print(scales[layer_name_q]) scales[layer_name_qkv]["y"] = torch.cat([ scales[layer_name_q]["y"], scales[layer_name_k]["y"], scales[layer_name_v]["y"] @@ -188,6 +187,7 @@ def hf_gpt_converter(args): smooth_llama_model(model, act_range, args.smoothquant, llama_qkv_para, llama_smoother) + args.multi_query_mode = model.config.num_attention_heads != model.config.num_key_value_heads config = configparser.ConfigParser() config["llama"] = {} for key in vars(args): @@ -319,9 +319,6 @@ if __name__ == "__main__": type=str, default="fp32", choices=["fp32", "fp16"]) - parser.add_argument("--multi-query-mode", - action="store_true", - help="Use multi-query-attention.") args = parser.parse_args() print("\n=============== Argument ===============") diff --git a/examples/llama/run.py b/examples/llama/run.py index 7c2493735e..2091a9b20c 100644 --- a/examples/llama/run.py +++ b/examples/llama/run.py @@ -19,7 +19,7 @@ from pathlib import Path import numpy as np import torch -from transformers import LlamaTokenizer +from transformers import LlamaTokenizerFast import tensorrt_llm from tensorrt_llm.quantization import QuantMode @@ -51,8 +51,8 @@ def read_config(config_path: Path): world_size = tp_size * pp_size assert world_size == tensorrt_llm.mpi_world_size(), \ f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - num_heads = config['builder_config']['num_heads'] // tp_size - hidden_size = config['builder_config']['hidden_size'] // tp_size + num_heads = config['builder_config']['num_heads'] + hidden_size = config['builder_config']['hidden_size'] vocab_size = config['builder_config']['vocab_size'] num_layers = config['builder_config']['num_layers'] num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) @@ -65,21 +65,28 @@ def read_config(config_path: Path): ) num_kv_heads = 1 num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size + assert (num_heads % tp_size) == 0 + num_heads = num_heads // tp_size + hidden_size = hidden_size // tp_size use_custom_all_reduce = config['plugin_config'].get('use_custom_all_reduce', False) + max_prompt_embedding_table_size = config['builder_config'].get( + 'max_prompt_embedding_table_size', 0) - model_config = ModelConfig(num_heads=num_heads, - num_kv_heads=num_kv_heads, - hidden_size=hidden_size, - vocab_size=vocab_size, - num_layers=num_layers, - 
gpt_attention_plugin=use_gpt_attention_plugin, - paged_kv_cache=paged_kv_cache, - tokens_per_block=tokens_per_block, - remove_input_padding=remove_input_padding, - dtype=dtype, - quant_mode=quant_mode, - use_custom_all_reduce=use_custom_all_reduce) + model_config = ModelConfig( + num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + remove_input_padding=remove_input_padding, + dtype=dtype, + quant_mode=quant_mode, + use_custom_all_reduce=use_custom_all_reduce, + max_prompt_embedding_table_size=max_prompt_embedding_table_size) return model_config, tp_size, pp_size, dtype @@ -121,6 +128,37 @@ def parse_input(input_text: str, input_file: str, tokenizer, end_id: int, return input_ids, input_lengths +def ptuning_setup(prompt_table, dtype, hidden_size, tasks, input_ids, + input_lengths, remove_input_padding): + if prompt_table is not None: + prompt_table = torch.from_numpy(np.load(prompt_table)) + task_vocab_size = torch.tensor([prompt_table.shape[1]], + dtype=torch.int32, + device="cuda") + prompt_table = prompt_table.view( + (prompt_table.shape[0] * prompt_table.shape[1], + prompt_table.shape[2])) + prompt_table = prompt_table.cuda().to( + dtype=tensorrt_llm._utils.str_dtype_to_torch(dtype)) + else: + prompt_table = torch.empty([1, hidden_size]).cuda() + task_vocab_size = torch.zeros([1]).cuda() + + num_sequences = input_lengths.size( + 0) if remove_input_padding else input_ids.size(0) + + if tasks is not None: + tasks = torch.tensor([int(t) for t in tasks.split(',')], + dtype=torch.int32, + device="cuda") + assert tasks.shape[ + 0] == num_sequences, "Number of supplied tasks must match input batch size" + else: + tasks = torch.zeros([num_sequences]).cuda() + + return [prompt_table, tasks, task_vocab_size] + + def print_output(output_ids, input_lengths, max_output_len, tokenizer, output_csv, output_npy, sequence_lengths): num_beams = output_ids.size(1) @@ -138,6 +176,7 @@ def print_output(output_ids, input_lengths, max_output_len, tokenizer, print(f'Output: \"{output_text}\"') output_ids = output_ids.reshape((-1, output_ids.size(2))) + print(output_ids) if output_csv is not None: output_file = Path(output_csv) @@ -190,6 +229,13 @@ def parse_arguments(): type=int, help="How often to return tokens when streaming.", default=5) + parser.add_argument( + '--prompt_table', + type=Path, + help="Path to .npy file, exported by nemo_prompt_convert.py") + parser.add_argument( + '--tasks', + help="Comma-separated list of tasks for prompt tuning: ex 0,3,1,0") return parser.parse_args() @@ -205,6 +251,8 @@ def generate( num_beams: int = 1, streaming: bool = False, streaming_interval: int = 5, + prompt_table: Path = None, + tasks: str = None, ): tensorrt_llm.logger.set_level(log_level) @@ -220,7 +268,7 @@ def generate( pp_size=pp_size) torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir, legacy=False) + tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer_dir, legacy=False) sampling_config = SamplingConfig(end_id=EOS_TOKEN, pad_id=PAD_TOKEN, @@ -242,14 +290,20 @@ def generate( input_ids, input_lengths = parse_input(input_text, input_file, tokenizer, EOS_TOKEN, model_config.remove_input_padding) + print(input_ids) max_input_length = torch.max(input_lengths).item() decoder.setup(input_lengths.size(0), max_input_length, max_output_len, 
num_beams) + ptuning_args = [] if model_config.max_prompt_embedding_table_size == 0 else ptuning_setup( + prompt_table, dtype, model_config.hidden_size, tasks, input_ids, + input_lengths, model_config.remove_input_padding) + outputs = decoder.decode(input_ids, input_lengths, sampling_config, + *ptuning_args, streaming=streaming, output_sequence_lengths=True, return_dict=True) diff --git a/examples/llama/weight.py b/examples/llama/weight.py index 37e9e81505..d347c341e0 100644 --- a/examples/llama/weight.py +++ b/examples/llama/weight.py @@ -223,6 +223,11 @@ def load_from_hf_llama(tensorrt_llm_llama: tensorrt_llm.models.LLaMAForCausalLM, else: v = torch_to_numpy(v.to(torch_dtype).detach().cpu()) if 'model.embed_tokens.weight' in k: + if hf_llama.config.tie_word_embeddings: + # lm_head.weight has the same weights as embedding + if mapping.is_last_pp_rank(): + tensorrt_llm_llama.lm_head.weight.value = np.ascontiguousarray( + split(v, mapping.tp_size, mapping.tp_rank)) if tensorrt_llm_llama.use_parallel_embedding: v = split(v, mapping.tp_size, mapping.tp_rank, tensorrt_llm_llama.embedding_sharding_dim) @@ -818,7 +823,8 @@ def load_from_binary(tensorrt_llm_llama: LLaMAForCausalLM, def load_from_gptq_llama(tensorrt_llm_llama, quant_ckpt_path, mapping=Mapping(), - dtype="float16"): + dtype="float16", + ft_model_dir=None): tensorrt_llm.logger.info( 'Loading weights from groupwise GPTQ LLaMA safetensors...') tik = time.time() @@ -1019,7 +1025,8 @@ def load_from_gptq_llama(tensorrt_llm_llama, def load_from_awq_llama(tensorrt_llm_llama: LLaMAForCausalLM, quant_ckpt_path, mapping=Mapping(), - dtype="float16"): + dtype="float16", + ft_model_dir=None): tensorrt_llm.logger.info( 'Loading weights from groupwise AWQ LLaMA safetensors...') tik = time.time() @@ -1052,12 +1059,23 @@ def load_from_awq_llama(tensorrt_llm_llama: LLaMAForCausalLM, "post_layernorm.weight", ] - getattr(tensorrt_llm_llama, 'quant_mode', QuantMode(0)) + quant_mode = getattr(tensorrt_llm_llama, 'quant_mode', QuantMode(0)) + # Int8 KV cache + use_int8_kv_cache = quant_mode.has_int8_kv_cache() packer = torch.ops.fastertransformer.pack_int8_tensor_to_packed_int4 preprocessor = torch.ops.fastertransformer.preprocess_weights_for_mixed_gemm torch_dtype = str_dtype_to_torch(dtype) + def fromfile(dir_path, name, shape=None, dtype=None): + p = dir_path + '/' + name + if Path(p).exists(): + t = np.fromfile(p, dtype=dtype) + if shape is not None: + t = t.reshape(shape) + return t + return None + def AWQ_quantize_pack_preprocess(weight, scale): scale = scale.repeat_interleave(group_size, dim=0) weight = weight / scale @@ -1217,6 +1235,18 @@ def load_from_awq_llama(tensorrt_llm_llama: LLaMAForCausalLM, mOp = tensorrt_llm_llama.layers[layer_idx].mlp.fc process_and_assign_weight(awq_llama, mPrefix, mOp, 1) + if use_int8_kv_cache: + assert ft_model_dir, "You must pass --ft_model_dir to tell TRT-LLM where to look for scales of INT8 kv cache." + t = fromfile( + ft_model_dir, 'model.layers.' 
+ str(layer_idx) + + '.attention.query_key_value.scale_y_quant_orig.bin', [1], + np.float32) + assert t is not None, f"{ft_model_dir} does not contain model.layers.{layer_idx}.attention.query_key_value.scale_y_quant_orig.bin" + tensorrt_llm_llama.layers[ + layer_idx].attention.kv_orig_quant_scale.value = 1.0 / t + tensorrt_llm_llama.layers[ + layer_idx].attention.kv_quant_orig_scale.value = t + v = awq_llama['model.norm.weight'] if mapping.is_last_pp_rank(): tensorrt_llm_llama.ln_f.weight.value = v.to(torch_dtype).cpu().numpy() diff --git a/examples/mpt/build.py b/examples/mpt/build.py index f947a8d83b..2f88cd320d 100644 --- a/examples/mpt/build.py +++ b/examples/mpt/build.py @@ -26,7 +26,7 @@ from tensorrt_llm.builder import Builder from tensorrt_llm.layers import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import smooth_quantize, weight_only_quantize +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -429,11 +429,9 @@ def build_rank_engine(builder: Builder, use_parallel_embedding=args.use_parallel_embedding, embedding_sharding_dim=args.embedding_sharding_dim, share_embedding_table=share_embedding_table) - if args.use_smooth_quant: - tensorrt_llm_gpt = smooth_quantize(tensorrt_llm_gpt, args.quant_mode) - elif args.use_weight_only: - tensorrt_llm_gpt = weight_only_quantize(tensorrt_llm_gpt, - args.quant_mode) + + if args.use_smooth_quant or args.use_weight_only: + tensorrt_llm_gpt = quantize_model(tensorrt_llm_gpt, args.quant_mode) if args.model_dir is not None: gpt_dummy_fp8_scaling_factors = { @@ -528,6 +526,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = args.output_dir / 'config.json' builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_gpt) + return engine @@ -569,7 +570,7 @@ def build(rank, args): multi_query_mode=args.multi_query_mode, strongly_typed=args.strongly_typed, use_prompt_tuning=args.max_prompt_embedding_table_size > 0, - fp8=args.enable_fp8, + quant_mode=args.quant_mode, use_parallel_embedding=args.use_parallel_embedding) engine_name = get_engine_name(MODEL_NAME, args.dtype, args.world_size, @@ -585,6 +586,7 @@ def build(rank, args): ) serialize_engine(engine, args.output_dir / engine_name) + del engine if rank == 0: ok = builder.save_timing_cache(builder_config, timing_cache_file) diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index 61be4accb8..f46bff3100 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,2 +1,2 @@ -datasets~=2.3.2 +datasets~=2.14.5 rouge_score~=0.1.2 diff --git a/examples/mpt/run.py b/examples/mpt/run.py index 5ad7ab361b..ff7b88a610 100644 --- a/examples/mpt/run.py +++ b/examples/mpt/run.py @@ -42,7 +42,6 @@ def read_config(config_path: Path): multi_query_mode = config['builder_config']['multi_query_mode'] paged_kv_cache = config['plugin_config']['paged_kv_cache'] tokens_per_block = config['plugin_config']['tokens_per_block'] - use_prompt_tuning = config['builder_config']['use_prompt_tuning'] num_kv_heads = 1 if multi_query_mode else num_heads dtype = config['builder_config']['precision'] @@ -55,13 +54,13 @@ def read_config(config_path: Path): remove_input_padding=remove_input_padding, paged_kv_cache=paged_kv_cache, tokens_per_block=tokens_per_block, - use_prompt_tuning=use_prompt_tuning, 
dtype=dtype) dtype = config['builder_config']['precision'] max_input_len = config['builder_config']['max_input_len'] + use_prompt_tuning = config['builder_config']['use_prompt_tuning'] - return model_config, world_size, dtype, max_input_len + return model_config, world_size, dtype, max_input_len, use_prompt_tuning def parse_input(input_text: str, input_file: str, tokenizer, pad_id: int, @@ -234,7 +233,8 @@ def generate( engine_dir = Path(engine_dir) config_path = engine_dir / 'config.json' - model_config, world_size, dtype, max_input_len = read_config(config_path) + model_config, world_size, dtype, max_input_len, use_prompt_tuning = read_config( + config_path) runtime_rank = tensorrt_llm.mpi_rank() runtime_mapping = tensorrt_llm.Mapping(world_size, @@ -284,7 +284,7 @@ def generate( max_output_len, beam_width=num_beams) - ptuning_args = [] if not model_config.use_prompt_tuning else ptuning_setup( + ptuning_args = [] if not use_prompt_tuning else ptuning_setup( prompt_table, dtype, model_config.hidden_size, tasks, input_ids, input_lengths, model_config.remove_input_padding) diff --git a/examples/opt/build.py b/examples/opt/build.py index 2bcd53f81d..7f974800d9 100644 --- a/examples/opt/build.py +++ b/examples/opt/build.py @@ -25,7 +25,7 @@ from tensorrt_llm._utils import str_dtype_to_trt from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import weight_only_quantize +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -251,8 +251,8 @@ def build_rank_engine(builder: Builder, embedding_sharding_dim=args.embedding_sharding_dim, share_embedding_table=share_embedding_table) if args.use_weight_only: - tensorrt_llm_gpt = weight_only_quantize(tensorrt_llm_gpt, - args.quant_mode) + tensorrt_llm_gpt = quantize_model(tensorrt_llm_gpt, args.quant_mode) + if args.model_dir is not None: load_from_ft(tensorrt_llm_gpt, args.model_dir, @@ -313,6 +313,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_gpt) + return engine @@ -359,6 +362,7 @@ def build(rank, args): cache = builder_config.trt_builder_config.get_timing_cache() serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine if rank == 0: ok = builder.save_timing_cache( diff --git a/requirements-dev-windows.txt b/requirements-dev-windows.txt index b7cd8dcb7c..5fa7659a9b 100644 --- a/requirements-dev-windows.txt +++ b/requirements-dev-windows.txt @@ -1,9 +1,9 @@ ---pre --extra-index-url https://download.pytorch.org/whl/nightly/cu121 -# torch is CPU-only on Windows, so need to specify a torch version with GPU support -torch==2.1.0.dev20230828+cu121 -torchvision==0.16.0.dev20230828 -torchdata==0.7.0.dev20230828 -torchtext==0.16.0.dev20230828 +--extra-index-url https://download.pytorch.org/whl/cu121 +# Default torch is CPU-only on Windows, so need to specify a torch version with GPU support +torch==2.1.0+cu121 +torchvision==0.16.0+cu121 +torchdata==0.7.0 +torchtext==0.16.0+cpu tokenizers==0.13.3 transformers==4.33.1 diffusers==0.15.0 @@ -23,4 +23,4 @@ einops parameterized graphviz pywin32 -pynvml +pynvml>=11.5.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index 86e4fae049..02536864b2 100644 --- a/requirements-dev.txt +++ 
b/requirements-dev.txt @@ -17,4 +17,4 @@ pre-commit einops parameterized graphviz -pynvml +pynvml>=11.5.0 diff --git a/requirements-windows.txt b/requirements-windows.txt index a904da9ff8..2c1de9a1a3 100644 --- a/requirements-windows.txt +++ b/requirements-windows.txt @@ -1,10 +1,10 @@ build ---pre --extra-index-url https://download.pytorch.org/whl/nightly/cu121 -# torch is CPU-only on Windows, so need to specify a torch version with GPU support -torch==2.1.0.dev20230828+cu121 -torchvision==0.16.0.dev20230828 -torchdata==0.7.0.dev20230828 -torchtext==0.16.0.dev20230828 +--extra-index-url https://download.pytorch.org/whl/cu121 +# Default torch is CPU-only on Windows, so need to specify a torch version with GPU support +torch==2.1.0+cu121 +torchvision==0.16.0+cu121 +torchdata==0.7.0 +torchtext==0.16.0+cpu tokenizers==0.13.3 transformers==4.33.1 diffusers==0.15.0 diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 1b567710c5..d2af07eddc 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -51,7 +51,8 @@ def main(build_type: str = "Release", use_ccache: bool = False, cpp_only: bool = False, install: bool = False, - skip_building_wheel: bool = False): + skip_building_wheel: bool = False, + python_bindings: bool = False): project_dir = Path(__file__).parent.resolve().parent os.chdir(project_dir) build_run = partial(run, shell=True, check=True) @@ -142,22 +143,28 @@ def main(build_type: str = "Release", build_pyt = "OFF" if cpp_only else "ON" th_common_lib = "" if cpp_only else "th_common" + build_pybind = "ON" if python_bindings else "OFF" + bindings_lib = "bindings" if python_bindings else "" with working_directory(build_dir): cmake_def_args = " ".join(cmake_def_args) if clean or first_build: build_run( - f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" {cmake_cuda_architectures}' - f' {cmake_def_args} -S "{source_dir}"') + f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBUILD_PYBIND="{build_pybind}"' + f' {cmake_cuda_architectures} {cmake_def_args} -S "{source_dir}"' + ) build_run( - f'cmake --build . --config {build_type} --parallel {job_count} --target tensorrt_llm tensorrt_llm_static nvinfer_plugin_tensorrt_llm {th_common_lib} ' + f'cmake --build . --config {build_type} --parallel {job_count} ' + f'--target tensorrt_llm tensorrt_llm_static nvinfer_plugin_tensorrt_llm {th_common_lib} {bindings_lib}' f'{" ".join(extra_make_targets)}') if cpp_only: assert not install, "Installing is not supported for cpp_only builds" return - lib_dir = project_dir / "tensorrt_llm/libs" + pkg_dir = project_dir / "tensorrt_llm" + assert pkg_dir.is_dir(), f"{pkg_dir} is not a directory" + lib_dir = pkg_dir / "libs" if lib_dir.exists(): rmtree(lib_dir) lib_dir.mkdir(parents=True) @@ -176,6 +183,15 @@ def main(build_type: str = "Release", "tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so", lib_dir / "libnvinfer_plugin_tensorrt_llm.so") + if python_bindings: + # TODO Add windows support for python bindings. 
+ pybind_lib = list( + (build_dir / "tensorrt_llm" / "pybind").glob("bindings.*.so")) + assert len( + pybind_lib + ) == 1, f"Exactly one pybind library should be present: {pybind_lib}" + copy(pybind_lib[0], pkg_dir) + if dist_dir is None: dist_dir = project_dir / "build" else: @@ -244,5 +260,9 @@ if __name__ == "__main__": action="store_true", help= "Do not build the *.whl files (they are only needed for distribution).") + parser.add_argument("--python_bindings", + "-p", + action="store_true", + help="Build the python bindings for the C++ runtime.") args = parser.parse_args() main(**vars(args)) diff --git a/setup.py b/setup.py index bf4e055f45..1644c422fa 100644 --- a/setup.py +++ b/setup.py @@ -56,13 +56,16 @@ setup( install_requires=required_deps, dependency_links=extra_URLs, zip_safe=True, + license="Apache License 2.0", packages=find_packages(), + # TODO Add windows support for python bindings. package_data={ 'tensorrt_llm': (['libs/th_common.dll', 'libs/nvinfer_plugin_tensorrt_llm.dll'] - if platform.system() == "Windows" else - ['libs/libth_common.so', 'libs/libnvinfer_plugin_tensorrt_llm.so']) + - ['tools/plugin_gen/templates/*'] + if platform.system() == "Windows" else [ + 'libs/libth_common.so', 'libs/libnvinfer_plugin_tensorrt_llm.so', + 'bindings.*.so' + ]) + ['tools/plugin_gen/templates/*'], }, python_requires=">=3.7, <4", distclass=BinaryDistribution, diff --git a/tensorrt_llm/_common.py b/tensorrt_llm/_common.py index 9b8ccb12d4..01bbc72017 100644 --- a/tensorrt_llm/_common.py +++ b/tensorrt_llm/_common.py @@ -13,9 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. import contextlib +import ctypes import platform +import time from pathlib import Path +import numpy as np +import tensorrt as trt import torch from ._utils import str_dtype_to_trt @@ -80,3 +84,79 @@ def precision(dtype): prev_dtype = switch_net_dtype(dtype) yield switch_net_dtype(prev_dtype) + + +def serialize_engine(engine, path): + logger.info(f'Serializing engine to {path}...') + tik = time.time() + if isinstance(engine, trt.ICudaEngine): + engine = engine.serialize() + with open(path, 'wb') as f: + f.write(bytearray(engine)) + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Engine serialized. Total time: {t}') + + +def deserialize_engine(path): + runtime = trt.Runtime(logger.trt_logger) + with open(path, 'rb') as f: + logger.info(f'Loading engine from {path}...') + tik = time.time() + + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine is not None + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Engine loaded. 
Total time: {t}') + return engine + + +_field_dtype_to_np_dtype_dict = { + trt.PluginFieldType.FLOAT16: np.float16, + trt.PluginFieldType.FLOAT32: np.float32, + trt.PluginFieldType.FLOAT64: np.float64, + trt.PluginFieldType.INT8: np.int8, + trt.PluginFieldType.INT16: np.int16, + trt.PluginFieldType.INT32: np.int32, +} + + +def field_dtype_to_np_dtype(dtype): + ret = _field_dtype_to_np_dtype_dict.get(dtype) + assert ret is not None, f'Unsupported dtype: {dtype}' + return ret + + +def convert_capsule_to_void_p(capsule): + ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p + ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ + ctypes.py_object, ctypes.c_char_p + ] + return ctypes.pythonapi.PyCapsule_GetPointer(capsule, None) + + +def get_nparray_from_void_p(void_pointer, elem_size, field_dtype): + ctypes.pythonapi.PyMemoryView_FromMemory.restype = ctypes.py_object + ctypes.pythonapi.PyMemoryView_FromMemory.argtypes = [ + ctypes.c_char_p, ctypes.c_ssize_t, ctypes.c_int + ] + logger.info( + f'get_nparray: pointer = {void_pointer}, elem_size = {elem_size}') + char_pointer = ctypes.cast(void_pointer, ctypes.POINTER(ctypes.c_char)) + np_dtype = field_dtype_to_np_dtype(field_dtype) + buf_bytes = elem_size * np.dtype(np_dtype).itemsize + logger.info(f'get_nparray: buf_bytes = {buf_bytes}') + mem_view = ctypes.pythonapi.PyMemoryView_FromMemory( + char_pointer, buf_bytes, 0) # number 0 represents PyBUF_READ + logger.info( + f'get_nparray: mem_view = {mem_view}, field_dtype = {field_dtype}') + buf = np.frombuffer(mem_view, np_dtype) + return buf + + +def get_scalar_from_field(field): + void_p = convert_capsule_to_void_p(field.data) + np_array = get_nparray_from_void_p(void_p, 1, field.type) + return np_array[0] diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 91a695ac30..b5f0ae4193 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -13,26 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy -import ctypes import json import math -import time +import struct from functools import partial import numpy as np import tensorrt as trt import torch -from .logger import logger - # numpy doesn't know bfloat16, define abstract binary type instead np_bfloat16 = np.dtype('V2', metadata={"dtype": "bfloat16"}) -def torch_to_numpy(x): +def torch_to_numpy(x: torch.Tensor): + assert isinstance(x, torch.Tensor), \ + f'x must be a torch.Tensor object, but got {type(x)}.' if x.dtype != torch.bfloat16: - return x.numpy() - return x.view(torch.int16).numpy().view(np_bfloat16) + return x.cpu().numpy() + return x.view(torch.int16).cpu().numpy().view(np_bfloat16) fp32_array = partial(np.array, dtype=np.float32) @@ -192,33 +191,6 @@ def dim_resolve_negative(dim, ndim): return tuple(pos) -def serialize_engine(engine, path): - logger.info(f'Serializing engine to {path}...') - tik = time.time() - if isinstance(engine, trt.ICudaEngine): - engine = engine.serialize() - with open(path, 'wb') as f: - f.write(bytearray(engine)) - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Engine serialized. Total time: {t}') - - -def deserialize_engine(path): - runtime = trt.Runtime(logger.trt_logger) - with open(path, 'rb') as f: - logger.info(f'Loading engine from {path}...') - tik = time.time() - - engine = runtime.deserialize_cuda_engine(f.read()) - assert engine is not None - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Engine loaded. 
Total time: {t}') - return engine - - def mpi_comm(): from mpi4py import MPI return MPI.COMM_WORLD @@ -251,50 +223,16 @@ def to_json_file(obj, json_file_path): writer.write(to_json_string(obj)) -_field_dtype_to_np_dtype_dict = { - trt.PluginFieldType.FLOAT16: np.float16, - trt.PluginFieldType.FLOAT32: np.float32, - trt.PluginFieldType.FLOAT64: np.float64, - trt.PluginFieldType.INT8: np.int8, - trt.PluginFieldType.INT16: np.int16, - trt.PluginFieldType.INT32: np.int32, -} +def numpy_fp32_to_bf16(src): + # Numpy doesn't support bfloat16 type + # Convert float32 to bfloat16 manually and assign with bf16 abstract type + original_shape = src.shape + src = src.flatten() + src = np.ascontiguousarray(src) - -def field_dtype_to_np_dtype(dtype): - ret = _field_dtype_to_np_dtype_dict.get(dtype) - assert ret is not None, f'Unsupported dtype: {dtype}' - return ret - - -def convert_capsule_to_void_p(capsule): - ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p - ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ - ctypes.py_object, ctypes.c_char_p - ] - return ctypes.pythonapi.PyCapsule_GetPointer(capsule, None) - - -def get_nparray_from_void_p(void_pointer, elem_size, field_dtype): - ctypes.pythonapi.PyMemoryView_FromMemory.restype = ctypes.py_object - ctypes.pythonapi.PyMemoryView_FromMemory.argtypes = [ - ctypes.c_char_p, ctypes.c_ssize_t, ctypes.c_int - ] - logger.info( - f'get_nparray: pointer = {void_pointer}, elem_size = {elem_size}') - char_pointer = ctypes.cast(void_pointer, ctypes.POINTER(ctypes.c_char)) - np_dtype = field_dtype_to_np_dtype(field_dtype) - buf_bytes = elem_size * np.dtype(np_dtype).itemsize - logger.info(f'get_nparray: buf_bytes = {buf_bytes}') - mem_view = ctypes.pythonapi.PyMemoryView_FromMemory( - char_pointer, buf_bytes, 0) # number 0 represents PyBUF_READ - logger.info( - f'get_nparray: mem_view = {mem_view}, field_dtype = {field_dtype}') - buf = np.frombuffer(mem_view, np_dtype) - return buf - - -def get_scalar_from_field(field): - void_p = convert_capsule_to_void_p(field.data) - np_array = get_nparray_from_void_p(void_p, 1, field.type) - return np_array[0] + assert src.dtype == np.float32 + dst = np.empty_like(src, dtype=np.uint16) + for i in range(len(dst)): + bytes = struct.pack(' BuilderConfig: @@ -114,6 +115,7 @@ class Builder(): ''' self.strongly_typed = strongly_typed + quant_mode = kwargs.get("quant_mode", QuantMode(0)) if not strongly_typed and precision not in self._ALLOWED_PRECISIONS: logger.error( f"precision should be one of {self._ALLOWED_PRECISIONS}") @@ -125,6 +127,8 @@ class Builder(): config = self.trt_builder.create_builder_config() if not strongly_typed: + fp8 = quant_mode.has_fp8_qdq() or quant_mode.has_fp8_kv_cache() + if precision == 'float16': config.set_flag(trt.BuilderFlag.FP16) config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS) @@ -173,7 +177,6 @@ class Builder(): tensor_parallel=tensor_parallel, use_refit=use_refit, int8=int8, - fp8=fp8, **kwargs) def _add_optimization_profile(self, network: Network, diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 4c6efcae2d..23dd159d43 100644 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -1772,7 +1772,7 @@ def _lookup_plugin(input: Tensor, weight: Tensor, rank: int) -> Tensor: Parameters: input : Tensor - The input tensor the contains the indices to perform the lookup. + The input tensor contains the indices to perform the lookup. weight : Tensor The table to gather from. 
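The new `numpy_fp32_to_bf16` helper in `_utils.py` exists because NumPy has no native bfloat16 type, so values are stored through the abstract `np_bfloat16` view; the attention layer in the next diff uses it to build the rotary `embed_positions` constant when the model dtype is bfloat16. The snippet below is a rough, vectorized sketch of the same idea (truncating each float32 to its upper 16 bits, without rounding), not the library's element-wise `struct.pack` implementation.

```python
import numpy as np

# Mirrors the abstract 2-byte dtype defined in tensorrt_llm/_utils.py.
np_bfloat16 = np.dtype('V2', metadata={"dtype": "bfloat16"})

def fp32_to_bf16_truncate(src: np.ndarray) -> np.ndarray:
    """Reinterpret each float32 as uint32, keep the 16 high bits, view as bf16."""
    assert src.dtype == np.float32
    bits = np.ascontiguousarray(src).view(np.uint32)  # raw IEEE-754 bit patterns
    upper = (bits >> 16).astype(np.uint16)            # bf16 = top 16 bits of fp32
    return upper.view(np_bfloat16)
```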
diff --git a/tensorrt_llm/layers/attention.py b/tensorrt_llm/layers/attention.py index c2ac95b807..c7cddac874 100644 --- a/tensorrt_llm/layers/attention.py +++ b/tensorrt_llm/layers/attention.py @@ -19,9 +19,10 @@ import numpy as np import tensorrt as trt from .._common import default_net, precision +from .._utils import numpy_fp32_to_bf16 from ..functional import (AttentionMaskType, PositionEmbeddingType, RotaryScalingType, Tensor, bert_attention, cast, clip, - concat, constant, expand_dims, expand_mask, + concat, constant, embedding, expand_dims, expand_mask, generate_alibi_biases, generate_alibi_slopes, gpt_attention, matmul, repeat_interleave, round, shape, slice, softmax, split, view, where) @@ -48,7 +49,7 @@ class RopeEmbeddingUtils: dtype=np.float32) concat = np.concatenate((np.sin(sinusoid_inp), np.cos(sinusoid_inp)), axis=1) - return np.expand_dims(concat, axis=0) + return np.expand_dims(concat, axis=0).astype(np.float32) @staticmethod def rotate_every_two(tensor: Tensor) -> Tensor: @@ -140,6 +141,83 @@ class RopeEmbeddingUtils: raise ValueError('The PositionEmbeddingType is not RoPE') return (tensor * cos) + (rotate_func(tensor) * sin) + @staticmethod + def apply_rotary_pos_emb_chatglm( + qkv, + position_embedding, + num_attention_heads, + attention_head_size, + max_position_embeddings, + ) -> Tensor: + + half_head_size = attention_head_size // 2 + qkv_shape = shape(qkv) + qkv = qkv.view( + concat([ + shape(qkv, 0), + shape(qkv, 1), + num_attention_heads, + 3, + attention_head_size, + ])) + query, key, value = split(qkv, 1, dim=3) + q_shape = concat([ + shape(qkv, 0), + shape(qkv, 1), + num_attention_heads, + attention_head_size, + ]) + query = query.view(q_shape) + key = key.view(q_shape) + value = value.view(q_shape) + + embedding_weight = RopeEmbeddingUtils.create_sinusoidal_positions( + max_position_embeddings, half_head_size) + embedding_weight = np.split(embedding_weight.squeeze(0), 2, axis=1) + embedding_weight = np.concatenate( + [ + embedding_weight[0], + embedding_weight[0], + embedding_weight[1], + embedding_weight[1], + ], + axis=1, + ) + + embedding_weight = constant(embedding_weight) + position_embedding = embedding(position_embedding, embedding_weight) + position_embedding, block_embedding = split( + position_embedding, + 1, + dim=1, + ) + sin0, cos0 = split(position_embedding, half_head_size, dim=3) + sin1, cos1 = split(block_embedding, half_head_size, dim=3) + + new_shape = concat([ + shape(qkv, 0), + shape(qkv, 1), + 1, + half_head_size, + ]) + position_embedding = [ + tensor.view(new_shape) for tensor in [cos0, cos1, sin0, sin1] + ] + + query = RopeEmbeddingUtils.apply_rotary_pos_emb( + tensor=query, + position_embedding=position_embedding, + pos_emb_type=PositionEmbeddingType.chatglm) + key = RopeEmbeddingUtils.apply_rotary_pos_emb( + tensor=key, + position_embedding=position_embedding, + pos_emb_type=PositionEmbeddingType.chatglm) + + qkv = concat([query, key, value], dim=2) + qkv = qkv.view(qkv_shape) + + return qkv + class AttentionParams(object): @@ -381,6 +459,7 @@ class Attention(Module): encoder_output: Optional[Tensor] = None, workspace=None, position_embedding=None, + norm_before_bmm1=False, ): assert isinstance(hidden_states, Tensor) @@ -399,36 +478,15 @@ class Attention(Module): alibi_scale=alibi_scale) qkv = self.qkv(hidden_states) + if self.position_embedding_type == PositionEmbeddingType.chatglm: - qkv = qkv.view( - concat([ - shape(qkv, 0), - shape(qkv, 1), self.num_attention_heads, 3, - self.attention_head_size - ])) - query, key, value = 
split(qkv, 1, dim=3) - q_shape = concat([ - shape(qkv, 0), - shape(qkv, 1), self.num_attention_heads, - self.attention_head_size - ]) - query = query.view(q_shape) - key = key.view(q_shape) - value = value.view(q_shape) - - query = RopeEmbeddingUtils.apply_rotary_pos_emb( - query, - position_embedding=position_embedding, - pos_emb_type=PositionEmbeddingType.chatglm) - key = RopeEmbeddingUtils.apply_rotary_pos_emb( - key, - position_embedding=position_embedding, - pos_emb_type=PositionEmbeddingType.chatglm) - - qkv = concat([query, key, value], dim=2) - qkv = qkv.view( - concat([shape(qkv, 0), - shape(qkv, 1), self.hidden_size * 3])) + qkv = RopeEmbeddingUtils.apply_rotary_pos_emb_chatglm( + qkv, + position_embedding, + self.num_attention_heads, + self.attention_head_size, + self.max_position_embeddings, + ) paged_kv_cache = default_net().plugin_config.paged_kv_cache @@ -548,7 +606,12 @@ class Attention(Module): value = transpose_for_scores(value, is_kv=True) if self.rotary_enabled: - embed_positions = constant(self.embed_positions) + if self.dtype == trt.bfloat16: + embed_positions = numpy_fp32_to_bf16( + self.embed_positions.astype(np.float32)) + embed_positions = constant(embed_positions) + else: + embed_positions = constant(self.embed_positions) if self.rotary_embedding_dim is not None: # When shape(hidden_states, 1) > 1(Context phase), the embedding start from 0, @@ -726,10 +789,13 @@ class Attention(Module): key = key.permute([0, 1, 3, 2]) with precision('float32'): + if norm_before_bmm1: + # Apply norm on query earlier to prevent matmul fp16 overflow. + query /= self.norm_factor attention_scores = matmul(cast(query, 'float32'), cast(key, 'float32')) - - attention_scores = attention_scores / self.norm_factor + if not norm_before_bmm1: + attention_scores = attention_scores / self.norm_factor if self.attention_mask_type in [ AttentionMaskType.causal, diff --git a/tensorrt_llm/layers/embedding.py b/tensorrt_llm/layers/embedding.py index 8e5be06e7b..d742902bdf 100644 --- a/tensorrt_llm/layers/embedding.py +++ b/tensorrt_llm/layers/embedding.py @@ -74,8 +74,11 @@ class Embedding(Module): class PromptTuningEmbedding(Embedding): """ - Pass all tokens though both normal and prompt embedding tables. - Then, combine results based on whether the token was "normal" or "prompt/virtual". + PromptTuningEmbedding handles fine-tuned prompts with virtual tokens. At runtime, + a supplementary embedding dictionary is passed. Tokens whose ids are >= vocab_size are embedded + with that additional dictionary. + The prompt tuning dictionary holds multiple tasks, and each sequence is assigned a given task. + Prompt-tuned tokens from a given sequence use the adequate task dictionary, as defined by the `tasks` input. """ def __init__(self, @@ -100,6 +103,27 @@ class PromptTuningEmbedding(Embedding): tasks, task_vocab_size, workspace: Optional[Tensor] = None): + """ + Pass all tokens through both normal and prompt embedding tables. + Tokens are masked so that "normal" embedding only see "normal" tokens. Same logic for "prompt" embedding. + After those two embedding, combine results based on whether the token was "normal" or "prompt-tuned". 
+ + Parameters: + tokens : Tensor + the ids to embbed, size [batch_size, seq_len] + + prompt_embedding_table : Tensor + the additional embedding table for prompt-tuned tokens, size [num_tasks * num_tokens_per_task, hidden_size] + + tasks: Tensor + the task required by each token, size [batch_size, seq_len] + + task_vocab_size: Tensor + the number of tokens used for each task, should be equal to prompt_embedding_table's num_tokens_per_task, size [1] + + Returns: + Tokens' embedding + """ # do not use ">=" because internally the layer works with floating points prompt_tokens_mask = tokens > (self.vocab_size - 1) diff --git a/tensorrt_llm/logger.py b/tensorrt_llm/logger.py index 5fa4193ef9..7e8c2179e0 100644 --- a/tensorrt_llm/logger.py +++ b/tensorrt_llm/logger.py @@ -16,7 +16,8 @@ import logging import os import tensorrt as trt -from mpi4py import MPI + +from ._utils import mpi_rank, mpi_world_size try: from polygraphy.logger import G_LOGGER @@ -62,8 +63,8 @@ class Logger(metaclass=Singleton): self._polygraphy_logger.module_severity = severity_map[ min_severity][2] - self.mpi_rank = MPI.COMM_WORLD.Get_rank() - self.mpi_size = MPI.COMM_WORLD.Get_size() + self.mpi_rank = mpi_rank() + self.mpi_size = mpi_world_size() if invalid_severity: self.warning( f"Requested log level {environ_severity} is invalid. Using 'warning' instead" diff --git a/tensorrt_llm/models/__init__.py b/tensorrt_llm/models/__init__.py index 1460593b94..193e7a2c92 100755 --- a/tensorrt_llm/models/__init__.py +++ b/tensorrt_llm/models/__init__.py @@ -15,17 +15,15 @@ from .baichuan.model import BaichuanForCausalLM from .bert.model import BertForQuestionAnswering, BertModel from .bloom.model import BloomForCausalLM, BloomModel -from .chatglm2_6b.model import ChatGLM2_6BHeadModel, ChatGLM2_6BModel -from .chatglm6b.model import ChatGLM6BHeadModel, ChatGLM6BModel +from .chatglm.model import ChatGLMHeadModel, ChatGLMModel from .falcon.model import FalconForCausalLM, FalconModel from .gpt.model import GPTLMHeadModel, GPTModel from .gptj.model import GPTJForCausalLM, GPTJModel from .gptneox.model import GPTNeoXForCausalLM, GPTNeoXModel +from .internlm.model import InternLMForCausalLM, InternLMModel from .llama.model import LLaMAForCausalLM, LLaMAModel from .opt.model import OPTLMHeadModel, OPTModel -from .quantized.quant import (fp8_quantize, smooth_quantize, - weight_only_groupwise_quantize, - weight_only_quantize) +from .quantized.quant import quantize_model # noqa __all__ = [ 'BertModel', @@ -44,13 +42,10 @@ __all__ = [ 'GPTJForCausalLM', 'GPTNeoXModel', 'GPTNeoXForCausalLM', - 'smooth_quantize', - 'weight_only_quantize', - 'weight_only_groupwise_quantize', - 'fp8_quantize', - 'ChatGLM6BHeadModel', - 'ChatGLM6BModel', - 'ChatGLM2_6BHeadModel', - 'ChatGLM2_6BModel', + 'quantize_model', + 'ChatGLMHeadModel', + 'ChatGLMModel', 'BaichuanForCausalLM', + 'InternLMForCausalLM', + 'InternLMModel', ] diff --git a/tensorrt_llm/models/baichuan/model.py b/tensorrt_llm/models/baichuan/model.py index b1a6da7f91..a9cf262e2f 100644 --- a/tensorrt_llm/models/baichuan/model.py +++ b/tensorrt_llm/models/baichuan/model.py @@ -22,6 +22,7 @@ from ...layers import (Attention, AttentionMaskType, AttentionParams, RmsNorm) from ...mapping import Mapping from ...module import Module, ModuleList +from ...quantization import QuantMode from ..generation_mixin import GenerationMixin @@ -32,13 +33,28 @@ class BaichuanDecoderLayer(Module): num_attention_heads, max_position_embeddings, position_embedding_type, + num_kv_heads=None, dtype=None, + 
attention_mask_type=AttentionMaskType.causal, hidden_act='silu', mlp_hidden_size=None, tp_group=None, tp_size=1, - tp_rank=0): + tp_rank=0, + quant_mode=QuantMode(0)): super().__init__() + # used for quantizing model + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_kv_heads = num_kv_heads + self.max_position_embeddings = max_position_embeddings + self.dtype = dtype + self.hidden_act = hidden_act + self.tp_group = tp_group + self.tp_size = tp_size + self.mlp_hidden_size = mlp_hidden_size + self.attention_mask_type = attention_mask_type + self.position_embedding_type = position_embedding_type self.input_layernorm = RmsNorm(normalized_shape=hidden_size, dtype=dtype) @@ -46,23 +62,27 @@ class BaichuanDecoderLayer(Module): self.attention = Attention( hidden_size, num_attention_heads, + num_kv_heads=num_kv_heads, max_position_embeddings=max_position_embeddings, dtype=dtype, - attention_mask_type=AttentionMaskType.causal, + attention_mask_type=attention_mask_type, bias=False, position_embedding_type=position_embedding_type, tp_group=tp_group, tp_size=tp_size, - tp_rank=tp_rank) + tp_rank=tp_rank, + use_int8_kv_cache=quant_mode.has_int8_kv_cache(), + quant_mode=quant_mode) if not mlp_hidden_size: - mlp_hidden_size = hidden_size * 4 + self.mlp_hidden_size = hidden_size * 4 self.mlp = GatedMLP(hidden_size=hidden_size, - ffn_hidden_size=mlp_hidden_size, + ffn_hidden_size=self.mlp_hidden_size, hidden_act=hidden_act, dtype=dtype, bias=False, tp_group=tp_group, - tp_size=tp_size) + tp_size=tp_size, + quant_mode=quant_mode) self.post_layernorm = RmsNorm(normalized_shape=hidden_size, dtype=dtype) def forward(self, @@ -101,6 +121,7 @@ class BaichuanModel(Module): def __init__(self, num_layers, num_heads, + num_kv_heads, hidden_size, vocab_size, hidden_act, @@ -108,8 +129,10 @@ class BaichuanModel(Module): position_embedding_type, dtype, mlp_hidden_size=None, - mapping=Mapping()): + mapping=Mapping(), + quant_mode=QuantMode(0)): super().__init__() + self.mapping = mapping self.num_layers = num_layers self.vocab_embedding = Embedding(vocab_size, hidden_size, dtype=dtype) @@ -119,12 +142,14 @@ class BaichuanModel(Module): num_attention_heads=num_heads, max_position_embeddings=max_position_embeddings, position_embedding_type=position_embedding_type, + num_kv_heads=num_kv_heads, dtype=dtype, hidden_act=hidden_act, mlp_hidden_size=mlp_hidden_size, tp_group=mapping.tp_group, tp_size=mapping.tp_size, - tp_rank=mapping.tp_rank) for _ in range(num_layers) + tp_rank=mapping.tp_rank, + quant_mode=quant_mode) for _ in range(num_layers) ]) self.ln_f = RmsNorm(normalized_shape=hidden_size, dtype=dtype) @@ -176,6 +201,7 @@ class BaichuanForCausalLM(BaichuanModel, GenerationMixin): def __init__(self, num_layers, num_heads, + num_kv_heads, hidden_size, vocab_size, hidden_act, @@ -183,22 +209,35 @@ class BaichuanForCausalLM(BaichuanModel, GenerationMixin): position_embedding_type, dtype, mlp_hidden_size=None, - mapping=Mapping()): + mapping=Mapping(), + quant_mode=QuantMode(0)): if isinstance(dtype, str): - self._kv_dtype = str_dtype_to_trt(dtype) + self.dtype = str_dtype_to_trt(dtype) else: assert isinstance(dtype, trt.DataType) - self._kv_dtype = dtype - self._num_layers = num_layers + self.dtype = dtype + + self.num_layers = num_layers self.num_heads = num_heads - self.num_kv_heads = num_heads + if num_kv_heads is None or num_kv_heads <= 0: + num_kv_heads = num_heads + self.num_kv_heads = num_kv_heads self.hidden_size = hidden_size self.vocab_size = vocab_size self.tp_size = 
mapping.tp_size - super().__init__(num_layers, num_heads, hidden_size, vocab_size, - hidden_act, max_position_embeddings, + + self.kv_dtype = self.dtype + if quant_mode.has_int8_kv_cache(): + self.kv_dtype = str_dtype_to_trt('int8') + elif quant_mode.has_fp8_kv_cache(): + self.kv_dtype = str_dtype_to_trt('fp8') + + self.quant_mode = quant_mode + + super().__init__(num_layers, num_heads, num_kv_heads, hidden_size, + vocab_size, hidden_act, max_position_embeddings, position_embedding_type, dtype, mlp_hidden_size, - mapping) + mapping, quant_mode) vocab_size_padded = pad_vocab_size(vocab_size, mapping.tp_size) self.lm_head = ColumnLinear(hidden_size, vocab_size_padded, @@ -229,11 +268,11 @@ class BaichuanForCausalLM(BaichuanModel, GenerationMixin): # [batch_size, hidden_size] -> [batch_size, vocab_size] lm_logits = self.lm_head(hidden_states) - lm_logits.mark_output('logits', self._kv_dtype) + lm_logits.mark_output('logits', self.dtype) if use_cache and default_net().plugin_config.paged_kv_cache == False: for i, present in enumerate(presents): - present.mark_output(f'present_key_value_{i}', self._kv_dtype) + present.mark_output(f'present_key_value_{i}', self.kv_dtype) return (lm_logits, presents) return lm_logits @@ -253,8 +292,6 @@ class BaichuanForCausalLM(BaichuanModel, GenerationMixin): # Prepare inputs head_size = self.hidden_size // self.num_heads - num_heads_kv = (self.num_kv_heads + self.tp_size - 1) // self.tp_size - remove_input_padding = default_net().plugin_config.remove_input_padding use_gpt_attention_plugin = default_net( ).plugin_config.gpt_attention_plugin @@ -267,15 +304,18 @@ class BaichuanForCausalLM(BaichuanModel, GenerationMixin): max_beam_width, max_input_len, max_new_tokens, - num_heads_kv, + self.num_kv_heads, head_size, - self._num_layers, - self._kv_dtype, + self.num_layers, + self.kv_dtype, remove_input_padding=remove_input_padding, use_gpt_attention_plugin=use_gpt_attention_plugin, use_gemm_plugin=use_gemm_plugin, paged_kv_cache=paged_kv_cache, tokens_per_block=tokens_per_block, + dtype=self.dtype, + num_heads=self.num_heads, + mapping=self.mapping, max_num_tokens=max_num_tokens) return (model_inputs['input_ids'], model_inputs['position_ids'], True, diff --git a/tensorrt_llm/models/chatglm2_6b/__init__.py b/tensorrt_llm/models/chatglm/__init__.py similarity index 100% rename from tensorrt_llm/models/chatglm2_6b/__init__.py rename to tensorrt_llm/models/chatglm/__init__.py diff --git a/tensorrt_llm/models/chatglm2_6b/model.py b/tensorrt_llm/models/chatglm/model.py similarity index 63% rename from tensorrt_llm/models/chatglm2_6b/model.py rename to tensorrt_llm/models/chatglm/model.py index b13d064a1f..5c0d18fd78 100644 --- a/tensorrt_llm/models/chatglm2_6b/model.py +++ b/tensorrt_llm/models/chatglm/model.py @@ -27,39 +27,57 @@ from ...module import Module, ModuleList from ..generation_mixin import GenerationMixin -class ChatGLM2_6BDecoderLayer(Module): +class ChatGLMDecoderLayer(Module): - def __init__(self, args): + def __init__(self, layer_id, args): super().__init__() - self.apply_residual_connection_post_layernorm = args.apply_residual_connection_post_layernorm - self.norm = RmsNorm if args.rmsnorm else LayerNorm + self.model_version = args.model_version self.use_cache = args.use_cache - self.input_layernorm = self.norm( + if self.model_version == "1": + self.alpha = (2 * args.num_layers)**0.5 + self.norm = LayerNorm + else: + self.apply_residual_connection_post_layernorm = args.apply_residual_connection_post_layernorm + self.norm = RmsNorm if args.rmsnorm else 
LayerNorm + + self.pre_norm = self.norm( normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, + eps=args.norm_epsilon, + elementwise_affine=True, dtype=args.dtype, ) - self.self_attention = Attention( + self.attention = Attention( hidden_size=args.hidden_size, num_attention_heads=args.num_heads, num_kv_heads=args.num_kv_heads, max_position_embeddings=args.max_seq_length, num_layers=args.num_layers, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling, - attention_mask_type=AttentionMaskType.causal, + attention_mask_type=AttentionMaskType.bidirectional + if args.model_version == "1" else AttentionMaskType.causal, bias=args.qkv_bias, dtype=args.dtype, - position_embedding_type=PositionEmbeddingType.rope_gptj, + position_embedding_type=PositionEmbeddingType.chatglm + if args.model_version == "1" else PositionEmbeddingType.rope_gptj, + rotary_embedding_base=10000.0, + rotary_embedding_scaling=None, use_int8_kv_cache=args.quant_mode.has_int8_kv_cache(), + rotary_embedding_percentage=0.5, tp_group=args.mapping.tp_group, tp_size=args.mapping.tp_size, + tp_rank=args.mapping.rank, multi_block_mode=args.multi_block_mode, quant_mode=args.quant_mode, - rotary_embedding_percentage=0.5, + q_scaling=1.0, + cross_attention=False, + relative_attention=False, + max_distance=0, + num_buckets=0, + instance_id=layer_id * 2, dense_bias=args.linear_bias, ) @@ -67,76 +85,114 @@ class ChatGLM2_6BDecoderLayer(Module): hidden_size=args.hidden_size, ffn_hidden_size=args.ffn_hidden_size, hidden_act=args.hidden_act, - dtype=args.dtype, bias=args.linear_bias, + dtype=args.dtype, tp_group=args.mapping.tp_group, tp_size=args.mapping.tp_size, + quant_mode=args.quant_mode, + instance_id=layer_id * 2 + 1, ) - self.post_layernorm = self.norm( + self.post_norm = self.norm( normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, + eps=args.norm_epsilon, + elementwise_affine=True, dtype=args.dtype, ) def forward( self, - hidden_states: Tensor = None, + hidden_states: Tensor, + position_ids: Tensor = None, # only used in ChatGLM-6B kv_cache_params: KeyValueCacheParams = None, attention_params: AttentionParams = None, ): - layernorm_output = self.input_layernorm(hidden_states) + norm_output = self.pre_norm(hidden_states) - attention_output = self.self_attention( - hidden_states=layernorm_output, + attention_output = self.attention( + hidden_states=norm_output, + attention_mask=None, use_cache=self.use_cache, kv_cache_params=kv_cache_params, attention_params=attention_params, + encoder_output=None, + workspace=None, + position_embedding=position_ids, ) if self.use_cache: attention_output, presents = attention_output - residual = layernorm_output if self.apply_residual_connection_post_layernorm else hidden_states + if self.model_version == "1": + residual = norm_output - layernorm_input = residual + attention_output + norm_input = residual * self.alpha + attention_output - layernorm_output = self.post_layernorm(layernorm_input) + norm_output = self.post_norm(norm_input) - mlp_output = self.mlp(layernorm_output) + mlp_output = self.mlp(norm_output) - residual = layernorm_output if self.apply_residual_connection_post_layernorm else layernorm_input + residual = norm_output - output = residual + mlp_output + output = residual * self.alpha + mlp_output + + else: + residual = norm_output if self.apply_residual_connection_post_layernorm else hidden_states + + norm_input = residual + attention_output + + norm_output = self.post_norm(norm_input) + + mlp_output = self.mlp(norm_output) + + residual = 
norm_output if self.apply_residual_connection_post_layernorm else norm_input + + output = residual + mlp_output return (output, presents) if self.use_cache else output -class ChatGLM2_6BTransformer(Module): +class ChatGLMModel(Module): def __init__(self, args): super().__init__() + self.norm = LayerNorm if args.model_version == "1" else RmsNorm self.use_cache = args.use_cache - self.layers = ModuleList( - ChatGLM2_6BDecoderLayer(args) for _ in range(args.num_layers)) + self.embedding = Embedding( + num_embeddings=args.vocab_size, + embedding_dim=args.hidden_size, + dtype=args.dtype, + tp_size=1, #args.mapping.tp_size, + tp_group=None, #args.mapping.tp_group, + sharding_dim=0, + tp_rank=0, #args.mapping.rank, + instance_id=args.num_layers * 2, + ) - self.final_layernorm = RmsNorm( + self.layers = ModuleList( + ChatGLMDecoderLayer(i, args) for i in range(args.num_layers)) + + self.final_norm = self.norm( normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, + eps=args.norm_epsilon, + elementwise_affine=True, dtype=args.dtype, ) def forward( self, - hidden_states, + input_ids: Tensor = None, + position_ids: Tensor = None, # only used in ChatGLM-6B kv_cache_params: KeyValueCacheParams = None, attention_params: AttentionParams = None, ): + hidden_states = self.embedding(input_ids) + if self.use_cache: presents = [] @@ -145,6 +201,7 @@ class ChatGLM2_6BTransformer(Module): kv_cache_params.kv_cache_block_pointers): layer_output = layer( hidden_states, + position_ids, kv_cache_params=KeyValueCacheParams( past_key_value=[past_key_value], kv_cache_block_pointers=[kv_cache_block_pointers], @@ -159,44 +216,13 @@ class ChatGLM2_6BTransformer(Module): hidden_states = layer_output[0] presents.append(layer_output[1]) - hidden_states = self.final_layernorm(hidden_states) + hidden_states = self.final_norm(hidden_states) return (hidden_states, tuple(presents)) if self.use_cache else hidden_states -class ChatGLM2_6BModel(Module): - - def __init__(self, args): - - super().__init__() - - self.embedding = Embedding( - num_embeddings=args.vocab_size, - embedding_dim=args.hidden_size, - dtype=args.dtype, - ) - - self.encoder = ChatGLM2_6BTransformer(args) - - def forward( - self, - input_ids: Tensor = None, - kv_cache_params: bool = None, - attention_params: bool = None, - ): - - inputs_embeds = self.embedding(input_ids) - - hidden_states, presents = self.encoder( - inputs_embeds, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - ) - return hidden_states, presents - - -class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): +class ChatGLMHeadModel(ChatGLMModel, GenerationMixin): def __init__(self, **args): @@ -204,17 +230,27 @@ class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): argNamespace = argparse.Namespace() for key, value in args.items(): argNamespace.__setattr__(key, value) + assert "model_version" in args.keys(), "model_version not set" # Other default values - argNamespace.apply_residual_connection_post_layernorm = False - argNamespace.ffn_hidden_size = 13696 - argNamespace.kv_channels = 128 - argNamespace.layernorm_epsilon = 1.0e-5 - argNamespace.linear_bias = False argNamespace.multi_block_mode = False - argNamespace.num_kv_heads = 2 - argNamespace.qkv_bias = True - argNamespace.rmsnorm = True + argNamespace.norm_epsilon = 1.0e-5 + argNamespace.tokens_per_block = 64 argNamespace.use_cache = True + if argNamespace.model_version == "1": + argNamespace.ffn_hidden_size = 16384 + argNamespace.linear_bias = True + argNamespace.max_seq_length = min( + 2048, 
argNamespace.max_position_embeddings) + argNamespace.num_kv_heads = 32 + argNamespace.qkv_bias = True + else: + argNamespace.apply_residual_connection_post_layernorm = False + argNamespace.ffn_hidden_size = 13696 + argNamespace.linear_bias = False + argNamespace.num_kv_heads = 2 + argNamespace.qkv_bias = True + argNamespace.rmsnorm = True + args = argNamespace else: args = args["args"] @@ -238,26 +274,29 @@ class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): self.kv_dtype = str_dtype_to_trt('fp8') self.hidden_size = args.hidden_size + self.mapping = args.mapping + self.max_num_tokens = args.max_output_len + args.max_input_len + self.model_version = args.model_version self.num_heads = args.num_heads self.num_kv_heads = args.num_kv_heads self.num_layers = args.num_layers - self.tp_size = args.mapping.tp_size + self.tokens_per_block = args.tokens_per_block self.use_cache = args.use_cache self.lm_head = ColumnLinear( in_features=self.hidden_size, - out_features=pad_vocab_size(args.vocab_size, self.tp_size), + out_features=pad_vocab_size(args.vocab_size, self.mapping.tp_size), bias=False, dtype=self.dtype, - tp_group=args.mapping.tp_group, - tp_size=self.tp_size, + tp_group=self.mapping.tp_group, + tp_size=self.mapping.tp_size, gather_output=True, ) def forward( self, input_ids: Tensor = None, - position_ids: Tensor = None, + position_ids: Tensor = None, # only used in ChatGLM-6B last_token_ids: Tensor = None, kv_cache_params: KeyValueCacheParams = None, attention_params: AttentionParams = None, @@ -265,6 +304,7 @@ class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): hidden_states = super().forward( input_ids, + position_ids, kv_cache_params, attention_params, ) @@ -306,7 +346,7 @@ class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): max_beam_width=max_beam_width, max_input_len=max_input_len, max_new_tokens=max_new_tokens, - num_kv_heads=self.num_kv_heads // self.tp_size, + num_kv_heads=self.num_kv_heads // self.mapping.tp_size, head_size=self.hidden_size // self.num_heads, num_layers=self.num_layers, kv_dtype=self.kv_dtype, @@ -315,6 +355,16 @@ class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): use_gpt_attention_plugin=default_net().plugin_config. gpt_attention_plugin, use_gemm_plugin=default_net().plugin_config.gemm_plugin, + use_custom_all_reduce=False, + paged_kv_cache=default_net().plugin_config.paged_kv_cache, + tokens_per_block=self.tokens_per_block, + gather_all_token_logits=False, + dtype=self.kv_dtype, + num_heads=self.num_heads, + mapping=self.mapping, + max_num_tokens=self.max_num_tokens, + prompt_embedding_table_size=0, + is_chatglm6b=(self.model_version == "1"), ) return (model_inputs['input_ids'], model_inputs['position_ids'], diff --git a/tensorrt_llm/models/chatglm6b/model.py b/tensorrt_llm/models/chatglm6b/model.py deleted file mode 100644 index 4f66e4e43a..0000000000 --- a/tensorrt_llm/models/chatglm6b/model.py +++ /dev/null @@ -1,370 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import argparse - -import numpy as np -import tensorrt as trt - -from ..._common import default_net -from ..._utils import (pad_vocab_size, str_dtype_to_np, str_dtype_to_trt, - trt_dtype_to_np) -from ...functional import (PositionEmbeddingType, Tensor, concat, - gather_last_token_logits, shape) -from ...layers import (MLP, Attention, AttentionMaskType, AttentionParams, - ColumnLinear, Embedding, KeyValueCacheParams, LayerNorm) -from ...module import Module, ModuleList -from ..generation_mixin import GenerationMixin - - -class ChatGLM6BDecoderLayer(Module): - - def __init__(self, args): - - super().__init__() - - self.use_cache = args.use_cache - - self.input_layernorm = LayerNorm( - normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, - dtype=args.dtype, - ) - - self.attention = Attention( - hidden_size=args.hidden_size, - num_attention_heads=args.num_heads, - num_kv_heads=args.num_heads, - max_position_embeddings=args.max_seq_length, - num_layers=args.num_layers, - apply_query_key_layer_scaling=args.apply_query_key_layer_scaling, - attention_mask_type=AttentionMaskType.bidirectional, - bias=args.bias, - dtype=args.dtype, - position_embedding_type=PositionEmbeddingType.chatglm, - use_int8_kv_cache=args.quant_mode.has_int8_kv_cache(), - tp_group=args.mapping.tp_group, - tp_size=args.mapping.tp_size, - multi_block_mode=args.multi_block_mode, - quant_mode=args.quant_mode, - ) - - self.mlp = MLP( - hidden_size=args.hidden_size, - ffn_hidden_size=args.ffn_hidden_size, - hidden_act=args.hidden_act, - dtype=args.dtype, - bias=args.bias, - tp_group=args.mapping.tp_group, - tp_size=args.mapping.tp_size, - ) - - self.post_layernorm = LayerNorm( - normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, - dtype=args.dtype, - ) - - def forward( - self, - hidden_states: Tensor, - position_embedding: Tensor, - kv_cache_params: KeyValueCacheParams = None, - attention_params: AttentionParams = None, - ): - - layernorm_output = self.input_layernorm(hidden_states) - - attention_output = self.attention( - hidden_states=layernorm_output, - attention_mask=None, - use_cache=self.use_cache, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - encoder_output=None, - workspace=None, - position_embedding=position_embedding, - ) - - if self.use_cache: - attention_output, presents = attention_output - - layernorm_input = layernorm_output * 7.484375 + attention_output - - layernorm_output = self.post_layernorm(layernorm_input) - - mlp_output = self.mlp(layernorm_output) - - output = layernorm_output * 7.484375 + mlp_output - - return (output, presents) if self.use_cache else output - - -class ChatGLM6BModel(Module): - - def __init__(self, args): - - super().__init__() - - self.use_cache = args.use_cache - self.half_head_size = args.hidden_size // args.num_heads // 2 - - self.embedding = Embedding( - num_embeddings=args.vocab_size, - embedding_dim=args.hidden_size, - dtype=args.dtype, - ) - - # pre-compute weight of position embedding manually - if isinstance(args.dtype, trt.DataType): - np_dtype = trt_dtype_to_np(args.dtype) - else: - np_dtype = str_dtype_to_np(args.dtype) - - inv_freq = 10**(-1 / 16 * - np.arange(0, 64, 2, dtype=np.float32)).reshape(1, 32) - valueTable = np.matmul( - np.arange(args.max_seq_length, dtype=np.float32).reshape(-1, 1), - np.tile(inv_freq, [1, 2]), - ).reshape(args.max_seq_length, 64) - - self.position_embedding_cos = Embedding( - 
num_embeddings=args.max_seq_length, - embedding_dim=self.half_head_size, - dtype=args.dtype, - ) - self.position_embedding_sin = Embedding( - num_embeddings=args.max_seq_length, - embedding_dim=self.half_head_size, - dtype=args.dtype, - ) - - self.position_embedding_cos.weight.value = np.cos(valueTable).astype( - np_dtype) - self.position_embedding_sin.weight.value = np.sin(valueTable).astype( - np_dtype) - - self.layers = ModuleList( - ChatGLM6BDecoderLayer(args) for _ in range(args.num_layers)) - - self.final_layernorm = LayerNorm( - normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, - dtype=args.dtype, - ) - - def forward( - self, - input_ids: Tensor = None, - position_ids: Tensor = None, - kv_cache_params: KeyValueCacheParams = None, - attention_params: AttentionParams = None, - ): - - batch_size = shape(input_ids, 0) - input_len = shape(input_ids, 1) - - hidden_states = self.embedding(input_ids) - - position_embedding_cos = self.position_embedding_cos(position_ids) - position_embedding_sin = self.position_embedding_sin(position_ids) - - position_embedding_cos0, position_embedding_cos1 = position_embedding_cos.split( - 1, dim=1) - position_embedding_sin0, position_embedding_sin1 = position_embedding_sin.split( - 1, dim=1) - - position_embedding_cos0 = position_embedding_cos0.view( - concat([batch_size, input_len, 1, self.half_head_size])) - position_embedding_cos1 = position_embedding_cos1.view( - concat([batch_size, input_len, 1, self.half_head_size])) - position_embedding_sin0 = position_embedding_sin0.view( - concat([batch_size, input_len, 1, self.half_head_size])) - position_embedding_sin1 = position_embedding_sin1.view( - concat([batch_size, input_len, 1, self.half_head_size])) - - position_embedding = [ - position_embedding_cos0, position_embedding_cos1, - position_embedding_sin0, position_embedding_sin1 - ] - - if kv_cache_params.past_key_value is None: - kv_cache_params.past_key_value = tuple([None] * len(self.layers)) - - if self.use_cache: - presents = [] - - for layer, past_key_value, kv_cache_block_pointers in zip( - self.layers, kv_cache_params.past_key_value, - kv_cache_params.kv_cache_block_pointers): - layer_output = layer( - hidden_states, - position_embedding, - kv_cache_params=KeyValueCacheParams( - past_key_value=[past_key_value], - kv_cache_block_pointers=[kv_cache_block_pointers], - host_past_key_value_lengths=kv_cache_params. 
- host_past_key_value_lengths, - cache_indirection=kv_cache_params.cache_indirection, - ), - attention_params=attention_params, - ) - - if self.use_cache: - hidden_states = layer_output[0] - presents.append(layer_output[1]) - - hidden_states = self.final_layernorm(hidden_states) - - return (hidden_states, - tuple(presents)) if self.use_cache else hidden_states - - -class ChatGLM6BHeadModel(ChatGLM6BModel, GenerationMixin): - - def __init__(self, **args): - - if "args" not in args.keys(): - argNamespace = argparse.Namespace() - for key, value in args.items(): - argNamespace.__setattr__(key, value) - # Other default values - argNamespace.bias = True - argNamespace.ffn_hidden_size = 16384 - argNamespace.layernorm_epsilon = 1.0e-5 - argNamespace.max_seq_length = argNamespace.max_position_embeddings - argNamespace.multi_block_mode = False - argNamespace.num_kv_heads = 32 - argNamespace.use_cache = True - args = argNamespace - else: - args = args["args"] - - self.init(args) - - def init(self, args): - - super().__init__(args) - - if isinstance(args.dtype, str): - self.kv_dtype = str_dtype_to_trt(args.dtype) - else: - assert isinstance(args.dtype, trt.DataType) - self.kv_dtype = args.dtype - self.dtype = self.kv_dtype - - if args.quant_mode.has_int8_kv_cache(): - self.kv_dtype = str_dtype_to_trt('int8') - elif args.quant_mode.has_fp8_kv_cache(): - self.kv_dtype = str_dtype_to_trt('fp8') - - self.hidden_size = args.hidden_size - self.num_heads = args.num_heads - self.num_kv_heads = args.num_kv_heads - self.num_layers = args.num_layers - self.tp_size = args.mapping.tp_size - self.use_cache = args.use_cache - - self.lm_head = ColumnLinear( - in_features=self.hidden_size, - out_features=pad_vocab_size(args.vocab_size, self.tp_size), - bias=False, - dtype=self.dtype, - tp_group=args.mapping.tp_group, - tp_size=self.tp_size, - gather_output=True, - ) - - def forward( - self, - input_ids: Tensor = None, - position_ids: Tensor = None, - last_token_ids: Tensor = None, - kv_cache_params: KeyValueCacheParams = None, - attention_params: AttentionParams = None, - ): - - hidden_states = super().forward( - input_ids, - position_ids, - kv_cache_params, - attention_params, - ) - - if self.use_cache: - hidden_states, presents = hidden_states - - hidden_states = gather_last_token_logits( - hidden_states, last_token_ids, - default_net().plugin_config.remove_input_padding) - - lm_logits = self.lm_head(hidden_states) - lm_logits.mark_output('logits', self.dtype) - - if self.use_cache and default_net( - ).plugin_config.paged_kv_cache == False: - for i, present in enumerate(presents): - present.mark_output(f'present_key_value_{i}', self.kv_dtype) - return (lm_logits, presents) - - return lm_logits - - def prepare_inputs( - self, - max_batch_size: int = 0, - max_input_len: int = 0, - max_new_tokens: int = 0, - use_cache: bool = True, - max_beam_width: int = 1, - ): - '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the - ranges of the dimensions of when using TRT dynamic shapes. 
- - @return: a list contains values which can be fed into the self.forward() - ''' - - model_inputs = self.prepare_basic_inputs( - max_batch_size=max_batch_size, - max_beam_width=max_beam_width, - max_input_len=max_input_len, - max_new_tokens=max_new_tokens, - num_kv_heads=self.num_kv_heads // self.tp_size, - head_size=self.hidden_size // self.num_heads, - num_layers=self.num_layers, - kv_dtype=self.kv_dtype, - remove_input_padding=default_net( - ).plugin_config.remove_input_padding, - use_gpt_attention_plugin=default_net().plugin_config. - gpt_attention_plugin, - use_gemm_plugin=default_net().plugin_config.gemm_plugin, - is_chatglm6b=True, - ) - - return (model_inputs['input_ids'], model_inputs['position_ids'], - model_inputs['last_token_ids'], - KeyValueCacheParams( - past_key_value=model_inputs['past_key_value'], - host_past_key_value_lengths=model_inputs[ - 'host_past_key_value_lengths'], - kv_cache_block_pointers=model_inputs[ - 'kv_cache_block_pointers_list'], - cache_indirection=model_inputs['cache_indirection'], - ), - AttentionParams( - sequence_length=model_inputs['sequence_length'], - context_lengths=model_inputs['context_lengths'], - host_context_lengths=model_inputs['host_context_lengths'], - max_context_length=max_input_len, - host_request_types=model_inputs['host_request_types'], - )) diff --git a/tensorrt_llm/models/generation_mixin.py b/tensorrt_llm/models/generation_mixin.py index 8a2cac99a7..61be1f3b68 100644 --- a/tensorrt_llm/models/generation_mixin.py +++ b/tensorrt_llm/models/generation_mixin.py @@ -22,7 +22,6 @@ from ..mapping import Mapping class GenerationMixin: - _use_prompt_tuning = False def get_transformer_layers(self, mapping, num_layers): layers_per_pipeline_stage = num_layers // mapping.pp_size @@ -51,7 +50,7 @@ class GenerationMixin: num_heads=None, mapping=Mapping(), max_num_tokens=None, - prompt_embedding_table_size=None, + prompt_embedding_table_size: int = 0, is_chatglm6b=False): max_len = max_input_len + max_new_tokens @@ -135,15 +134,30 @@ class GenerationMixin: [1, 1] if enable_two_optimization_profiles else [1]), ('num_tokens', num_tokens_range), ])) - position_ids = Tensor( - name='position_ids', - dtype=trt.int32, - shape=[1, -1], - dim_range=OrderedDict([ - ('batch_size_fake', - [1, 1] if enable_two_optimization_profiles else [1]), - ('num_tokens', num_tokens_range), - ])) + if is_chatglm6b: + position_ids = Tensor( + name='position_ids', + dtype=trt.int32, + shape=[1, 2, -1], + dim_range=OrderedDict([ + ('batch_size_fake', [1, 1] + if enable_two_optimization_profiles else [1]), + ('2', [2, 2] + if enable_two_optimization_profiles else [2]), + ('num_tokens', num_tokens_range), + ]), + ) + else: + position_ids = Tensor( + name='position_ids', + dtype=trt.int32, + shape=[1, -1], + dim_range=OrderedDict([ + ('batch_size_fake', [1, 1] + if enable_two_optimization_profiles else [1]), + ('num_tokens', num_tokens_range), + ]), + ) else: assert dtype is not None assert num_heads is not None @@ -180,16 +194,18 @@ class GenerationMixin: ('2', [2, 2] if enable_two_optimization_profiles else [2]), ('input_len', inlen_range), - ])) + ]), + ) else: - position_ids = Tensor(name='position_ids', - dtype=trt.int32, - shape=[-1, -1], - dim_range=OrderedDict([ - ('batch_size_beam_width', - bb_range), - ('input_len', inlen_range), - ])) + position_ids = Tensor( + name='position_ids', + dtype=trt.int32, + shape=[-1, -1], + dim_range=OrderedDict([ + ('batch_size_beam_width', bb_range), + ('input_len', inlen_range), + ]), + ) else: assert dtype is not None assert 
num_heads is not None @@ -389,9 +405,8 @@ class GenerationMixin: prompt_embedding_table = None tasks = None prompt_vocab_size = None - if self._use_prompt_tuning: + if prompt_embedding_table_size > 0: hidden_size = num_heads * head_size - assert prompt_embedding_table_size is not None, "prompt_embedding_table_size cannot be None when self._use_prompt_tuning is True" _p_embedding_range = [ 1, prompt_embedding_table_size // 2, prompt_embedding_table_size ] diff --git a/tensorrt_llm/models/gpt/model.py b/tensorrt_llm/models/gpt/model.py index 0135bc8eea..98580565a2 100644 --- a/tensorrt_llm/models/gpt/model.py +++ b/tensorrt_llm/models/gpt/model.py @@ -376,7 +376,6 @@ class GPTLMHeadModel(GPTModel, GenerationMixin): self._vocab_size = vocab_size self._tp_size = mapping.tp_size self._multi_query_mode = multi_query_mode - self._use_prompt_tuning = use_prompt_tuning super().__init__(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, dtype, mapping, @@ -444,7 +443,7 @@ class GPTLMHeadModel(GPTModel, GenerationMixin): use_cache, max_beam_width: int = 1, max_num_tokens: int = None, - prompt_embedding_table_size: int = 128, + prompt_embedding_table_size: int = 0, gather_all_token_logits: bool = False): '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the ranges of the dimensions of when using TRT dynamic shapes. diff --git a/tensorrt_llm/models/gptj/model.py b/tensorrt_llm/models/gptj/model.py index 0f1d6a7e63..4fdeee9db6 100644 --- a/tensorrt_llm/models/gptj/model.py +++ b/tensorrt_llm/models/gptj/model.py @@ -295,7 +295,8 @@ class GPTJForCausalLM(GPTJModel, GenerationMixin): use_custom_all_reduce=use_custom_all_reduce, paged_kv_cache=paged_kv_cache, tokens_per_block=tokens_per_block, - mapping=self.mapping) + mapping=self.mapping, + max_num_tokens=max_num_tokens) return (model_inputs['input_ids'], model_inputs['position_ids'], True, model_inputs['last_token_ids'], diff --git a/tensorrt_llm/models/gptneox/model.py b/tensorrt_llm/models/gptneox/model.py index 47e985f7f9..0202cc59dc 100644 --- a/tensorrt_llm/models/gptneox/model.py +++ b/tensorrt_llm/models/gptneox/model.py @@ -17,108 +17,14 @@ import tensorrt as trt from ..._common import default_net from ..._utils import pad_vocab_size, str_dtype_to_trt from ...functional import (PositionEmbeddingType, Tensor, - gather_last_token_logits, gpt_attention) -from ...layers import (MLP, AttentionMaskType, AttentionParams, ColumnLinear, - Embedding, KeyValueCacheParams, LayerNorm, RowLinear) + gather_last_token_logits) +from ...layers import (MLP, Attention, AttentionMaskType, AttentionParams, + ColumnLinear, Embedding, KeyValueCacheParams, LayerNorm) from ...mapping import Mapping from ...module import Module, ModuleList -from ...parameter import Parameter -from ...quantization import QuantMode from ..generation_mixin import GenerationMixin -class GPTNeoXAttention(Module): - - def __init__(self, - hidden_size, - num_attention_heads, - rotary_dim, - max_position_embeddings, - dtype=None, - multi_block_mode=False, - position_embedding_type=PositionEmbeddingType.rope_gpt_neox, - quant_mode=QuantMode(0), - tp_group=None, - tp_size=1): - super().__init__() - self.attention_head_size = hidden_size // num_attention_heads - self.num_attention_heads = num_attention_heads // tp_size - self.max_position_embeddings = max_position_embeddings - self.rotary_dim = rotary_dim - self.position_embedding_type = position_embedding_type - self.multi_block_mode = multi_block_mode - self.multi_query_mode = 
False - self.quant_mode = quant_mode - - if self.quant_mode.has_int8_kv_cache(): - self.kv_quantization_scale = Parameter(shape=(1, ), dtype='float32') - self.kv_dequantization_scale = Parameter(shape=(1, ), - dtype='float32') - else: - self.register_parameter('kv_quantization_scale', None) - self.register_parameter('kv_dequantization_scale', None) - - self.qkv = ColumnLinear(in_features=hidden_size, - out_features=hidden_size * 3, - bias=True, - tp_group=tp_group, - tp_size=tp_size, - gather_output=False, - dtype=dtype) - self.dense = RowLinear(in_features=hidden_size, - out_features=hidden_size, - bias=True, - dtype=dtype, - tp_group=tp_group, - tp_size=tp_size) - - def forward(self, - hidden_states: Tensor, - attention_mask=None, - use_cache=False, - kv_cache_params=None, - attention_params=None): - if not default_net().plugin_config.gpt_attention_plugin: - raise ValueError( - 'GPT-NeoX RoPE is only supported with GPTAttention plugin') - qkv = self.qkv(hidden_states) - - assert attention_params.is_valid( - default_net().plugin_config.gpt_attention_plugin, - default_net().plugin_config.remove_input_padding) - assert kv_cache_params.is_valid( - default_net().plugin_config.gpt_attention_plugin) - - context, past_key_value = gpt_attention( - tensor=qkv, - past_key_value=kv_cache_params.get_first_past_key_value(), - sequence_length=attention_params.sequence_length, - host_past_key_value_lengths=kv_cache_params. - host_past_key_value_lengths, - context_lengths=attention_params.context_lengths, - cache_indirection=kv_cache_params.cache_indirection, - host_request_types=attention_params.host_request_types, - num_heads=self.num_attention_heads, - num_kv_heads=self.num_attention_heads, - hidden_size_per_head=self.attention_head_size, - q_scaling=1.0, - rotary_embedding_dim=self.rotary_dim, - position_embedding_type=self.position_embedding_type, - multi_block_mode=self.multi_block_mode, - kv_orig_quant_scale=self.kv_quantization_scale, - kv_quant_orig_scale=self.kv_dequantization_scale, - kv_cache_quant_mode=self.quant_mode, - max_context_length=attention_params.max_context_length, - host_context_lengths=attention_params.host_context_lengths) - - context = self.dense(context) - - if use_cache: - return (context, past_key_value) - - return context - - class GPTNeoXDecoderLayer(Module): def __init__(self, @@ -141,13 +47,16 @@ class GPTNeoXDecoderLayer(Module): self.post_attention_layernorm = LayerNorm(normalized_shape=hidden_size, dtype=dtype) - self.attention = GPTNeoXAttention( + self.attention = Attention( hidden_size=hidden_size, num_attention_heads=num_attention_heads, - rotary_dim=rotary_dim, + rotary_embedding_percentage=rotary_dim / + (hidden_size // num_attention_heads), + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, max_position_embeddings=max_position_embeddings, dtype=dtype, - position_embedding_type=position_embedding_type, + attention_mask_type=AttentionMaskType.causal, + bias=True, tp_group=tp_group, tp_size=tp_size) @@ -179,7 +88,8 @@ class GPTNeoXDecoderLayer(Module): attention_mask=attention_mask, use_cache=use_cache, kv_cache_params=kv_cache_params, - attention_params=attention_params) + attention_params=attention_params, + norm_before_bmm1=True) if use_cache: attention_output, presents = attention_output diff --git a/tensorrt_llm/models/chatglm6b/__init__.py b/tensorrt_llm/models/internlm/__init__.py similarity index 100% rename from tensorrt_llm/models/chatglm6b/__init__.py rename to tensorrt_llm/models/internlm/__init__.py diff --git 
a/tensorrt_llm/models/internlm/model.py b/tensorrt_llm/models/internlm/model.py new file mode 100644 index 0000000000..2324757e42 --- /dev/null +++ b/tensorrt_llm/models/internlm/model.py @@ -0,0 +1,427 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import tensorrt as trt + +from ..._common import default_net +from ..._utils import pad_vocab_size, str_dtype_to_trt +from ...functional import gather_last_token_logits, recv, send +from ...layers import (Attention, AttentionMaskType, AttentionParams, + ColumnLinear, Embedding, GatedMLP, KeyValueCacheParams, + PositionEmbeddingType, RmsNorm) +from ...mapping import Mapping +from ...module import Module, ModuleList +from ...quantization import QuantMode +from ..generation_mixin import GenerationMixin + + +class InternLMDecoderLayer(Module): + + def __init__(self, + layer_id, + hidden_size, + num_attention_heads, + num_kv_heads=None, + max_position_embeddings=2048, + dtype=None, + attention_mask_type=AttentionMaskType.causal, + hidden_act='silu', + attn_bias=True, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + rotary_base=10000.0, + rotary_scaling=None, + mlp_hidden_size=None, + tp_group=None, + tp_size=1, + quant_mode=QuantMode(0), + rms_norm_eps=1e-06): + super().__init__() + self._layer_id = layer_id # useful for debugging + # used for quantizing model + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_kv_heads = num_kv_heads + self.max_position_embeddings = max_position_embeddings + self.dtype = dtype + self.hidden_act = hidden_act + self.tp_group = tp_group + self.tp_size = tp_size + self.mlp_hidden_size = mlp_hidden_size + self.attention_mask_type = attention_mask_type + self.position_embedding_type = position_embedding_type + self.input_layernorm = RmsNorm(normalized_shape=hidden_size, + eps=rms_norm_eps, + dtype=dtype) + + self.attention = Attention( + hidden_size, + num_attention_heads, + num_kv_heads, + max_position_embeddings, + dtype=dtype, + attention_mask_type=AttentionMaskType.causal, + bias=attn_bias, + position_embedding_type=position_embedding_type, + rotary_embedding_base=rotary_base, + rotary_embedding_scaling=rotary_scaling, + tp_group=tp_group, + tp_size=tp_size, + use_int8_kv_cache=quant_mode.has_int8_kv_cache(), + quant_mode=quant_mode, + instance_id=2 * layer_id, + ) + if not mlp_hidden_size: + self.mlp_hidden_size = hidden_size * 4 + self.mlp = GatedMLP(hidden_size=hidden_size, + ffn_hidden_size=self.mlp_hidden_size, + hidden_act=hidden_act, + dtype=dtype, + bias=False, + tp_group=tp_group, + tp_size=tp_size, + quant_mode=quant_mode, + instance_id=2 * layer_id + 1) + self.post_layernorm = RmsNorm(normalized_shape=hidden_size, + eps=rms_norm_eps, + dtype=dtype) + + def forward(self, + hidden_states, + attention_mask=None, + use_cache=False, + kv_cache_params=None, + attention_params=None, + all_reduce_workspace=None): + 
residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + if self._layer_id == 0: + self.register_network_output(f"norm0", hidden_states) + + attention_output = self.attention(hidden_states, + attention_mask=attention_mask, + use_cache=use_cache, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + workspace=all_reduce_workspace) + + if use_cache: + attention_output, presents = attention_output + if self._layer_id == 0: + self.register_network_output(f"attn", attention_output) + + hidden_states = residual + attention_output + + residual = hidden_states + hidden_states = self.post_layernorm(hidden_states) + if self._layer_id == 0: + self.register_network_output(f"norm1", hidden_states) + + hidden_states = self.mlp(hidden_states, all_reduce_workspace) + if self._layer_id == 0: + self.register_network_output(f"mlp", hidden_states) + + hidden_states = residual + hidden_states + if use_cache: + return (hidden_states, presents) + return hidden_states + + +class InternLMModel(Module): + + def __init__(self, + num_layers, + num_heads, + num_kv_heads, + hidden_size, + vocab_size, + hidden_act, + attn_bias, + max_position_embeddings, + dtype, + mlp_hidden_size=None, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + rotary_base=10000.0, + rotary_scaling=None, + mapping=Mapping(), + quant_mode=QuantMode(0), + use_parallel_embedding=False, + embedding_sharding_dim=0, + rms_norm_eps=1e-06): + super().__init__() + self.mapping = mapping + + if self.mapping.is_first_pp_rank(): + self.vocab_embedding = Embedding( + num_embeddings=vocab_size, + embedding_dim=hidden_size, + dtype=dtype, + tp_size=mapping.tp_size if use_parallel_embedding else 1, + tp_group=mapping.tp_group if use_parallel_embedding else None, + sharding_dim=embedding_sharding_dim, + tp_rank=mapping.tp_rank) + + self.layers = ModuleList([ + InternLMDecoderLayer( + layer_id=i, + hidden_size=hidden_size, + num_attention_heads=num_heads, + num_kv_heads=num_kv_heads, + max_position_embeddings=max_position_embeddings, + dtype=dtype, + hidden_act=hidden_act, + attn_bias=attn_bias, + mlp_hidden_size=mlp_hidden_size, + position_embedding_type=position_embedding_type, + rotary_base=rotary_base, + rotary_scaling=rotary_scaling, + tp_group=mapping.tp_group, + tp_size=mapping.tp_size, + quant_mode=quant_mode, + rms_norm_eps=rms_norm_eps) + for i in self.get_transformer_layers(self.mapping, num_layers) + ]) + + if self.mapping.is_last_pp_rank(): + self.ln_f = RmsNorm(normalized_shape=hidden_size, + eps=rms_norm_eps, + dtype=dtype) + + def forward(self, + input_ids, + position_ids=None, + use_cache=False, + attention_mask=None, + kv_cache_params=None, + attention_params=None, + hidden_states=None, + all_reduce_workspace=None): + + if kv_cache_params.past_key_value is None: + tuple([None] * len(self.layers)) + + if use_cache: + presents = [] + + if self.mapping.is_first_pp_rank(): + hidden_states = self.vocab_embedding(input_ids) + else: + hidden_states = recv(hidden_states, self.mapping.prev_pp_rank()) + self.register_network_output(f"embd", hidden_states) + + for layer, past, pointer in zip( + self.layers, kv_cache_params.past_key_value, + kv_cache_params.kv_cache_block_pointers): + hidden_states = layer( + hidden_states, + use_cache=use_cache, + attention_mask=attention_mask, + kv_cache_params=KeyValueCacheParams( + past_key_value=[past], + host_past_key_value_lengths=kv_cache_params. 
+ host_past_key_value_lengths, + kv_cache_block_pointers=[pointer], + cache_indirection=kv_cache_params.cache_indirection), + attention_params=attention_params, + all_reduce_workspace=all_reduce_workspace) + + if use_cache: + presents.append(hidden_states[1]) + hidden_states = hidden_states[0] + + if self.mapping.is_last_pp_rank(): + hidden_states = self.ln_f(hidden_states) + else: + hidden_states = send(hidden_states, self.mapping.next_pp_rank()) + + if use_cache: + return (hidden_states, tuple(presents)) + return hidden_states + + +class InternLMForCausalLM(InternLMModel, GenerationMixin): + + def __init__(self, + num_layers, + num_heads, + num_kv_heads, + hidden_size, + vocab_size, + hidden_act, + attn_bias, + max_position_embeddings, + dtype, + logits_dtype="float32", + mlp_hidden_size=None, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + rotary_base=10000.0, + rotary_scaling=None, + mapping=Mapping(), + quant_mode=QuantMode(0), + use_parallel_embedding=False, + embedding_sharding_dim=0, + rms_norm_eps=1e-06): + + if isinstance(dtype, str): + self.dtype = str_dtype_to_trt(dtype) + else: + assert isinstance(dtype, trt.DataType) + self.dtype = dtype + + if isinstance(logits_dtype, str): + self.logits_dtype = str_dtype_to_trt(logits_dtype) + else: + assert isinstance(logits_dtype, trt.DataType) + self.logits_dtype = logits_dtype + + self.num_layers = num_layers + self.num_heads = num_heads + if num_kv_heads is None or num_kv_heads <= 0: + num_kv_heads = num_heads + self.num_kv_heads = num_kv_heads + self.hidden_size = hidden_size + self.attn_bias = attn_bias + self.vocab_size = vocab_size + self.tp_size = mapping.tp_size + + self.kv_dtype = self.dtype + if quant_mode.has_int8_kv_cache(): + self.kv_dtype = str_dtype_to_trt('int8') + elif quant_mode.has_fp8_kv_cache(): + self.kv_dtype = str_dtype_to_trt('fp8') + + self.quant_mode = quant_mode + self.use_parallel_embedding = use_parallel_embedding + self.embedding_sharding_dim = embedding_sharding_dim + + super().__init__(num_layers, num_heads, num_kv_heads, hidden_size, + vocab_size, hidden_act, attn_bias, + max_position_embeddings, dtype, mlp_hidden_size, + position_embedding_type, rotary_base, rotary_scaling, + mapping, quant_mode, use_parallel_embedding, + embedding_sharding_dim, rms_norm_eps) + + vocab_size_padded = pad_vocab_size(vocab_size, mapping.tp_size) + if self.mapping.is_last_pp_rank(): + self.lm_head = ColumnLinear(hidden_size, + vocab_size_padded, + bias=False, + dtype=dtype, + tp_group=mapping.tp_group, + tp_size=mapping.tp_size, + gather_output=True) + + def forward(self, + input_ids, + position_ids=None, + use_cache=False, + last_token_ids=None, + attention_mask=None, + kv_cache_params=None, + attention_params=None, + hidden_states=None, + all_reduce_workspace=None): + hidden_states = super().forward(input_ids, position_ids, use_cache, + attention_mask, kv_cache_params, + attention_params, hidden_states, + all_reduce_workspace) + + if use_cache: + hidden_states, presents = hidden_states + + if self.mapping.is_last_pp_rank(): + hidden_states = gather_last_token_logits( + hidden_states, last_token_ids, + default_net().plugin_config.remove_input_padding) + + # [batch_size, hidden_size] -> [batch_size, vocab_size] + lm_logits = self.lm_head(hidden_states) + lm_logits.mark_output('logits', self.logits_dtype) + else: + hidden_states.mark_output('hidden_states_output', self.dtype) + + if use_cache and default_net().plugin_config.paged_kv_cache == False: + for i, present in zip( + 
self.get_transformer_layers(self.mapping, self.num_layers), + presents): + present.mark_output(f'present_key_value_{i}', self.kv_dtype) + if self.mapping.is_last_pp_rank(): + return (lm_logits, presents) + return (hidden_states, presents) + else: + if self.mapping.is_last_pp_rank(): + return lm_logits + return hidden_states + + def prepare_inputs(self, + max_batch_size, + max_input_len, + max_new_tokens, + use_cache, + max_beam_width, + max_num_tokens: int = None): + '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the + ranges of the dimensions of when using TRT dynamic shapes. + + @return: a list contains values which can be fed into the self.forward() + ''' + + # Prepare inputs + head_size = self.hidden_size // self.num_heads + remove_input_padding = default_net().plugin_config.remove_input_padding + use_gpt_attention_plugin = default_net( + ).plugin_config.gpt_attention_plugin + use_gemm_plugin = default_net().plugin_config.gemm_plugin + paged_kv_cache = default_net().plugin_config.paged_kv_cache + tokens_per_block = default_net().plugin_config.tokens_per_block + use_custom_all_reduce = default_net( + ).plugin_config.use_custom_all_reduce + + model_inputs = self.prepare_basic_inputs( + max_batch_size, + max_beam_width, + max_input_len, + max_new_tokens, + self.num_kv_heads, + head_size, + self.num_layers, + self.kv_dtype, + remove_input_padding=remove_input_padding, + use_gpt_attention_plugin=use_gpt_attention_plugin, + use_gemm_plugin=use_gemm_plugin, + use_custom_all_reduce=use_custom_all_reduce, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + dtype=self.dtype, + num_heads=self.num_heads, + mapping=self.mapping, + max_num_tokens=max_num_tokens) + + return (model_inputs['input_ids'], model_inputs['position_ids'], True, + model_inputs['last_token_ids'], model_inputs['attention_mask'], + KeyValueCacheParams( + past_key_value=model_inputs['past_key_value'], + host_past_key_value_lengths=model_inputs[ + 'host_past_key_value_lengths'], + kv_cache_block_pointers=model_inputs[ + 'kv_cache_block_pointers_list'], + cache_indirection=model_inputs['cache_indirection'], + ), + AttentionParams( + sequence_length=model_inputs['sequence_length'], + context_lengths=model_inputs['context_lengths'], + host_context_lengths=model_inputs['host_context_lengths'], + max_context_length=max_input_len, + host_request_types=model_inputs['host_request_types']), + model_inputs['hidden_states_input'], + model_inputs['all_reduce_workspace']) diff --git a/tensorrt_llm/models/llama/model.py b/tensorrt_llm/models/llama/model.py index 2270f61894..896ca77fa0 100644 --- a/tensorrt_llm/models/llama/model.py +++ b/tensorrt_llm/models/llama/model.py @@ -12,14 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
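# Illustrative sketch (not part of the diff): how the new InternLMForCausalLM and its
# prepare_inputs() added above are typically traced into a TensorRT network. The tiny
# hyper-parameters are placeholders and the builder plumbing is assumed to follow the
# pattern of the existing example build scripts; only the class and method names come
# from the code added in this diff.
import tensorrt_llm
from tensorrt_llm.models import InternLMForCausalLM
from tensorrt_llm.network import net_guard

model = InternLMForCausalLM(num_layers=2, num_heads=8, num_kv_heads=8, hidden_size=512,
                            vocab_size=32000, hidden_act='silu', attn_bias=True,
                            max_position_embeddings=2048, dtype='float16')

builder = tensorrt_llm.Builder()
network = builder.create_network()
with net_guard(network):
    network.set_named_parameters(model.named_parameters())
    # prepare_inputs() returns a tuple ordered to match forward(), so the graph can be
    # traced with a single splat call.
    inputs = model.prepare_inputs(max_batch_size=8, max_input_len=1024,
                                  max_new_tokens=512, use_cache=True, max_beam_width=1)
    model(*inputs)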
+from typing import Optional + import tensorrt as trt from ..._common import default_net from ..._utils import pad_vocab_size, str_dtype_to_trt -from ...functional import gather_last_token_logits, recv, send +from ...functional import Tensor, gather_last_token_logits, recv, send from ...layers import (Attention, AttentionMaskType, AttentionParams, ColumnLinear, Embedding, FusedGatedMLP, GatedMLP, - KeyValueCacheParams, PositionEmbeddingType, RmsNorm) + KeyValueCacheParams, PositionEmbeddingType, + PromptTuningEmbedding, RmsNorm) from ...mapping import Mapping from ...module import Module, ModuleList from ...quantization import QuantMode @@ -158,12 +161,15 @@ class LLaMAModel(Module): use_parallel_embedding=False, embedding_sharding_dim=0, rms_norm_eps=1e-06, - use_fused_mlp=False): + use_fused_mlp=False, + use_prompt_tuning: bool = False): super().__init__() self.mapping = mapping + self.use_prompt_tuning = use_prompt_tuning + EmbeddingCls = PromptTuningEmbedding if use_prompt_tuning else Embedding if self.mapping.is_first_pp_rank(): - self.vocab_embedding = Embedding( + self.vocab_embedding = EmbeddingCls( num_embeddings=vocab_size, embedding_dim=hidden_size, dtype=dtype, @@ -200,15 +206,20 @@ class LLaMAModel(Module): eps=rms_norm_eps, dtype=dtype) - def forward(self, - input_ids, - position_ids=None, - use_cache=False, - attention_mask=None, - kv_cache_params=None, - attention_params=None, - hidden_states=None, - all_reduce_workspace=None): + def forward( + self, + input_ids, + position_ids=None, + use_cache=False, + attention_mask=None, + kv_cache_params=None, + attention_params=None, + hidden_states=None, + all_reduce_workspace=None, + prompt_embedding_table: Optional[Tensor] = None, + prompt_tasks: Optional[Tensor] = None, + prompt_vocab_size: Optional[Tensor] = None, + ): if kv_cache_params.past_key_value is None: tuple([None] * len(self.layers)) @@ -216,8 +227,13 @@ class LLaMAModel(Module): if use_cache: presents = [] + ptuning_args = [] + if self.use_prompt_tuning: + ptuning_args = [ + prompt_embedding_table, prompt_tasks, prompt_vocab_size + ] if self.mapping.is_first_pp_rank(): - hidden_states = self.vocab_embedding(input_ids, + hidden_states = self.vocab_embedding(input_ids, *ptuning_args, all_reduce_workspace) else: hidden_states = recv(hidden_states, self.mapping.prev_pp_rank()) @@ -274,7 +290,8 @@ class LLaMAForCausalLM(LLaMAModel, GenerationMixin): use_parallel_embedding=False, embedding_sharding_dim=0, rms_norm_eps=1e-06, - use_fused_mlp=False): + use_fused_mlp=False, + use_prompt_tuning: bool = False): if isinstance(dtype, str): self.dtype = str_dtype_to_trt(dtype) @@ -312,7 +329,7 @@ class LLaMAForCausalLM(LLaMAModel, GenerationMixin): mlp_hidden_size, position_embedding_type, rotary_base, rotary_scaling, mapping, quant_mode, use_parallel_embedding, embedding_sharding_dim, - rms_norm_eps, use_fused_mlp) + rms_norm_eps, use_fused_mlp, use_prompt_tuning) vocab_size_padded = pad_vocab_size(vocab_size, mapping.tp_size) if self.mapping.is_last_pp_rank(): @@ -324,20 +341,27 @@ class LLaMAForCausalLM(LLaMAModel, GenerationMixin): tp_size=mapping.tp_size, gather_output=True) - def forward(self, - input_ids, - position_ids=None, - use_cache=False, - last_token_ids=None, - attention_mask=None, - kv_cache_params=None, - attention_params=None, - hidden_states=None, - all_reduce_workspace=None): + def forward( + self, + input_ids, + position_ids=None, + use_cache=False, + last_token_ids=None, + attention_mask=None, + kv_cache_params=None, + attention_params=None, + 
hidden_states=None, + all_reduce_workspace=None, + prompt_embedding_table: Optional[Tensor] = None, + prompt_tasks: Optional[Tensor] = None, + prompt_vocab_size: Optional[Tensor] = None, + ): hidden_states = super().forward(input_ids, position_ids, use_cache, attention_mask, kv_cache_params, attention_params, hidden_states, - all_reduce_workspace) + all_reduce_workspace, + prompt_embedding_table, prompt_tasks, + prompt_vocab_size) if use_cache: hidden_states, presents = hidden_states @@ -366,13 +390,16 @@ class LLaMAForCausalLM(LLaMAModel, GenerationMixin): return lm_logits return hidden_states - def prepare_inputs(self, - max_batch_size, - max_input_len, - max_new_tokens, - use_cache, - max_beam_width, - max_num_tokens: int = None): + def prepare_inputs( + self, + max_batch_size, + max_input_len, + max_new_tokens, + use_cache, + max_beam_width, + max_num_tokens: int = None, + prompt_embedding_table_size: int = 0, + ): '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the ranges of the dimensions of when using TRT dynamic shapes. @@ -408,23 +435,33 @@ class LLaMAForCausalLM(LLaMAModel, GenerationMixin): dtype=self.dtype, num_heads=self.num_heads, mapping=self.mapping, - max_num_tokens=max_num_tokens) + max_num_tokens=max_num_tokens, + prompt_embedding_table_size=prompt_embedding_table_size, + ) - return (model_inputs['input_ids'], model_inputs['position_ids'], True, - model_inputs['last_token_ids'], model_inputs['attention_mask'], - KeyValueCacheParams( - past_key_value=model_inputs['past_key_value'], - host_past_key_value_lengths=model_inputs[ - 'host_past_key_value_lengths'], - kv_cache_block_pointers=model_inputs[ - 'kv_cache_block_pointers_list'], - cache_indirection=model_inputs['cache_indirection'], - ), - AttentionParams( - sequence_length=model_inputs['sequence_length'], - context_lengths=model_inputs['context_lengths'], - host_context_lengths=model_inputs['host_context_lengths'], - max_context_length=max_input_len, - host_request_types=model_inputs['host_request_types']), - model_inputs['hidden_states_input'], - model_inputs['all_reduce_workspace']) + return ( + model_inputs['input_ids'], + model_inputs['position_ids'], + True, + model_inputs['last_token_ids'], + model_inputs['attention_mask'], + KeyValueCacheParams( + past_key_value=model_inputs['past_key_value'], + host_past_key_value_lengths=model_inputs[ + 'host_past_key_value_lengths'], + kv_cache_block_pointers=model_inputs[ + 'kv_cache_block_pointers_list'], + cache_indirection=model_inputs['cache_indirection'], + ), + AttentionParams( + sequence_length=model_inputs['sequence_length'], + context_lengths=model_inputs['context_lengths'], + host_context_lengths=model_inputs['host_context_lengths'], + max_context_length=max_input_len, + host_request_types=model_inputs['host_request_types']), + model_inputs['hidden_states_input'], + model_inputs['all_reduce_workspace'], + model_inputs['prompt_embedding_table'], + model_inputs['tasks'], + model_inputs['prompt_vocab_size'], + ) diff --git a/tensorrt_llm/models/opt/model.py b/tensorrt_llm/models/opt/model.py index 1c31c98e25..f2469c9729 100644 --- a/tensorrt_llm/models/opt/model.py +++ b/tensorrt_llm/models/opt/model.py @@ -291,7 +291,7 @@ class OPTLMHeadModel(OPTModel, GenerationMixin): max_new_tokens, use_cache, max_beam_width, - prompt_embedding_table_size=32): + prompt_embedding_table_size: int = 0): '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the ranges of the dimensions of when using TRT dynamic 
shapes. diff --git a/tensorrt_llm/models/quantized/ammo.py b/tensorrt_llm/models/quantized/ammo.py index 9747a6c612..cce07f4279 100644 --- a/tensorrt_llm/models/quantized/ammo.py +++ b/tensorrt_llm/models/quantized/ammo.py @@ -27,6 +27,29 @@ except ImportError: from ...logger import logger +def _register_falcon_linears(model): + """Register Falcon linear modules for quantization. + + Falcon models may be built from remote code that is loaded dynamically, so + their linear class has to be registered on the fly, right before + quantization. + + """ + if type(model).__name__ in ["RWForCausalLM", "FalconForCausalLM"]: + from ammo.torch.quantization import tensor_quant + from ammo.torch.quantization.nn.modules.quant_module import \ + QuantLinearConvBase + + linear_type = type(model.transformer.h[0].self_attention.dense) + + class QuantFalconLinearRW1B(linear_type, + QuantLinearConvBase): # type: ignore + default_quant_desc_weight = tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW + + atq.module_mapping.QUANT_MODULE_MAPPING[ + linear_type] = QuantFalconLinearRW1B.convert + + def _quantize_model(model: torch.nn.Module, qformat: Literal['fp8', 'int8_sq', 'int4_awq'], calib_dataloader: DataLoader, @@ -51,6 +74,8 @@ def _quantize_model(model: torch.nn.Module, logger.debug(f"Calibrating batch {idx}") model(data) + _register_falcon_linears(model) + logger.debug("Starting quantization...") atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) logger.debug("Quantization done") diff --git a/tensorrt_llm/models/quantized/quant.py b/tensorrt_llm/models/quantized/quant.py index e35772802e..246eee0421 100644 --- a/tensorrt_llm/models/quantized/quant.py +++ b/tensorrt_llm/models/quantized/quant.py @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
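# Illustrative sketch (not part of the diff): why _register_falcon_linears() in ammo.py
# above has to run lazily. The Hugging Face loading call is an assumption about how such
# a model reaches _quantize_model(); quant_cfg and calibrate_loop are the names used in
# the diff above and are not defined here.
from transformers import AutoModelForCausalLM

# The RWForCausalLM / FalconForCausalLM classes only come into existence once the
# checkpoint's remote code has been imported, so their Linear type cannot be added to
# AMMO's module mapping statically at import time.
model = AutoModelForCausalLM.from_pretrained('tiiuae/falcon-rw-1b',
                                             trust_remote_code=True)

# _quantize_model() therefore registers the mapping right before quantization:
#     _register_falcon_linears(model)
#     atq.quantize(model, quant_cfg, forward_loop=calibrate_loop)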
-from typing import Union +from typing import Any, Union import numpy as np from ...layers import ColumnLinear, RowLinear -from ...models import (BloomForCausalLM, FalconForCausalLM, GPTJForCausalLM, - GPTLMHeadModel, LLaMAForCausalLM) +from ...models import (BaichuanForCausalLM, BloomForCausalLM, FalconForCausalLM, + GPTJForCausalLM, GPTLMHeadModel, InternLMForCausalLM, + LLaMAForCausalLM) +from ...module import Module from ...quantization import QuantMode from ...quantization.layers import FP8Linear, FP8RowLinear @@ -68,7 +70,6 @@ def _smooth_quantize_gpt(model, quant_mode): dtype=layer.dtype, quant_mode=quant_mode) - setattr(model, 'quant_mode', quant_mode) return model @@ -113,7 +114,6 @@ def _smooth_quantize_llama(model, quant_mode): dtype=layer.dtype, quant_mode=quant_mode) - setattr(model, 'quant_mode', quant_mode) return model @@ -160,24 +160,78 @@ def _smooth_quantize_bloom(model, quant_mode): return model -def smooth_quantize(model, quant_mode): +def _smooth_quantize_baichuan(model, quant_mode): + # Baichuan models' structures are similar to LLaMA's, so we can reuse the impl + return _smooth_quantize_llama(model, quant_mode) + + +def _smooth_quantize_internlm(model, quant_mode): + assert quant_mode.has_act_and_weight_quant() + for layer in model.layers: + assert hasattr(layer, + "input_layernorm"), "The layer has no input_layernorm" + layer.input_layernorm = SmoothQuantRmsNorm( + normalized_shape=layer.hidden_size, + dtype=layer.dtype, + quant_mode=quant_mode) + assert hasattr(layer, "attention"), "The layer has no attention" + layer.attention = SmoothQuantAttention( + layer.hidden_size, + num_attention_heads=layer.num_attention_heads, + num_kv_heads=layer.num_kv_heads, + max_position_embeddings=layer.max_position_embeddings, + num_layers=model.num_layers, + dtype=layer.dtype, + attention_mask_type=layer.attention_mask_type, + position_embedding_type=layer.position_embedding_type, + tp_group=layer.tp_group, + tp_size=layer.tp_size, + quant_mode=quant_mode, + bias=model.attn_bias) + + assert hasattr(layer, "mlp"), "The layer has no mlp" + layer.mlp = SmoothQuantGatedMLP(hidden_size=model.hidden_size, + ffn_hidden_size=layer.mlp_hidden_size, + hidden_act=layer.hidden_act, + dtype=layer.dtype, + tp_group=layer.tp_group, + tp_size=layer.tp_size, + quant_mode=quant_mode, + bias=False) + assert hasattr( + layer, + "post_layernorm"), "The layer has no post_layernorm" + layer.post_layernorm = SmoothQuantRmsNorm( + normalized_shape=layer.hidden_size, + dtype=layer.dtype, + quant_mode=quant_mode) + + setattr(model, 'quant_mode', quant_mode) + return model + + +def _smooth_quantize(model, quant_mode): assert isinstance(model, GPTLMHeadModel) or isinstance(model, LLaMAForCausalLM) \ - or isinstance(model, BloomForCausalLM),\ - "Only GPTLMHeadModel, LLaMAForCausalLM and BloomForCausalLM are well tested now" + or isinstance(model, BloomForCausalLM) or isinstance(model, BaichuanForCausalLM) or isinstance(model, InternLMForCausalLM), \ + "Only GPTLMHeadModel, LLaMAForCausalLM, BloomForCausalLM, InternLMForCausalLM and BaichuanForCausalLM are well tested now" if isinstance(model, GPTLMHeadModel): return _smooth_quantize_gpt(model, quant_mode) elif isinstance(model, LLaMAForCausalLM): return _smooth_quantize_llama(model, quant_mode) elif isinstance(model, BloomForCausalLM): return _smooth_quantize_bloom(model, quant_mode) + elif isinstance(model, BaichuanForCausalLM): + return _smooth_quantize_baichuan(model, quant_mode) + elif isinstance(model, InternLMForCausalLM): + return
_smooth_quantize_internlm(model, quant_mode) else: assert False, f"Model {type(model).__name__} is not supported by SmoothQuant yet" -def weight_only_quantize(model, - quant_mode, - exclude_modules=None, - current_key_name=None): +def _weight_only_quantize(model, + quant_mode, + exclude_modules=None, + current_key_name=None): assert quant_mode.is_weight_only() exclude_modules = ['lm_head' @@ -189,8 +243,8 @@ def weight_only_quantize(model, current_key_name.append(name) if len(list(module.children())) > 0: - weight_only_quantize(module, quant_mode, exclude_modules, - current_key_name) + _weight_only_quantize(module, quant_mode, exclude_modules, + current_key_name) if isinstance(module, ColumnLinear) and name not in exclude_modules: if not any(key in '.'.join(current_key_name) @@ -218,18 +272,16 @@ def weight_only_quantize(model, current_key_name.pop(-1) - setattr(model, 'quant_mode', quant_mode) - return model -def weight_only_groupwise_quantize(model, - quant_mode, - group_size=128, - pre_quant_scale=False, - zero=False, - exclude_modules=None, - current_key_name=None): +def _weight_only_groupwise_quantize(model, + quant_mode, + group_size=128, + pre_quant_scale=False, + zero=False, + exclude_modules=None, + current_key_name=None): exclude_modules = ['lm_head' ] if exclude_modules is None else exclude_modules @@ -239,9 +291,9 @@ def weight_only_groupwise_quantize(model, current_key_name.append(name) if len(list(module.children())) > 0: - weight_only_groupwise_quantize(module, quant_mode, group_size, - pre_quant_scale, zero, - exclude_modules, current_key_name) + _weight_only_groupwise_quantize(module, quant_mode, group_size, + pre_quant_scale, zero, + exclude_modules, current_key_name) if isinstance(module, ColumnLinear) and name not in exclude_modules: if not any(key in '.'.join(current_key_name) @@ -273,8 +325,21 @@ def weight_only_groupwise_quantize(model, current_key_name.pop(-1) - setattr(model, 'quant_mode', quant_mode) + return model + +def quantize_model(model: Module, quant_mode: QuantMode, **kwargs: Any): + if quant_mode.has_fp8_qdq() or quant_mode.has_fp8_kv_cache(): + model = _fp8_quantize(model, quant_mode, **kwargs) + elif quant_mode.has_act_and_weight_quant(): + model = _smooth_quantize(model, quant_mode) + elif quant_mode.is_weight_only(): + if quant_mode.has_per_group_scaling(): + model = _weight_only_groupwise_quantize(model, quant_mode, **kwargs) + else: + model = _weight_only_quantize(model, quant_mode, **kwargs) + + setattr(model, "quant_mode", quant_mode) return model @@ -364,7 +429,7 @@ def _default_fp8_quantize(model: Union[GPTLMHeadModel, LLaMAForCausalLM, return model -def fp8_quantize(model, quant_mode: QuantMode, quant_scales: dict = None): +def _fp8_quantize(model, quant_mode: QuantMode, quant_scales: dict = None): if isinstance( model, (FalconForCausalLM, GPTJForCausalLM, GPTLMHeadModel, LLaMAForCausalLM)): diff --git a/tensorrt_llm/parameter.py b/tensorrt_llm/parameter.py index 69fe514804..b14a2953fa 100644 --- a/tensorrt_llm/parameter.py +++ b/tensorrt_llm/parameter.py @@ -12,21 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
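# Illustrative sketch (not part of the diff): the new quantize_model() entry point in
# quant.py above dispatches on the QuantMode flags, so callers no longer pick the
# _smooth_quantize / _weight_only_quantize / _fp8_quantize helpers themselves. The model
# construction and the exact QuantMode helper used below are assumptions; quantize_model()
# and the dispatch order come from the code above.
from tensorrt_llm.models import InternLMForCausalLM
from tensorrt_llm.models.quantized.quant import quantize_model
from tensorrt_llm.quantization import QuantMode

model = InternLMForCausalLM(num_layers=2, num_heads=8, num_kv_heads=8, hidden_size=512,
                            vocab_size=32000, hidden_act='silu', attn_bias=True,
                            max_position_embeddings=2048, dtype='float16')
quant_mode = QuantMode.use_weight_only(use_int4_weights=True)

model = quantize_model(model, quant_mode)  # is_weight_only() without per-group scaling
                                           # routes to _weight_only_quantize()
assert model.quant_mode == quant_mode      # quant_mode is now attached once, centrally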
-from typing import Sequence, Union +import math +from typing import Optional, Sequence, Union import numpy as np import tensorrt as trt +import torch from ._utils import str_dtype_to_trt, torch_to_numpy, trt_dtype_to_torch from .functional import Tensor, constant from .logger import logger -class Parameter(object): +class Parameter: _DEFAULT_DTYPE = trt.DataType.FLOAT def __init__(self, - value: Union[np.ndarray] = None, + value: Optional[Union[np.ndarray, torch.Tensor]] = None, shape: Sequence[int] = None, dtype: Union[str, trt.DataType] = None): if dtype is None: @@ -37,11 +39,10 @@ class Parameter(object): if isinstance(dtype, str): dtype = str_dtype_to_trt(dtype) if value is None: - import torch assert isinstance(shape, (list, tuple)) if len(shape) == 2: # Xavier initialization see https://paperswithcode.com/method/xavier-initialization - v_range = np.sqrt(6) / np.sqrt(shape[0] + shape[1]) + v_range = math.sqrt(6) / math.sqrt(shape[0] + shape[1]) else: v_range = 0.1 @@ -56,9 +57,8 @@ class Parameter(object): (shape), dtype=trt_dtype_to_torch(dtype), device='cuda') * 2 - 1 # value ~ U[-v_range, v_range] - value = torch_to_numpy((value * v_range).cpu()) - - self._value = value + value = value * v_range + self._value = self._regularize_value(value) @property def value(self) -> Tensor: @@ -75,13 +75,21 @@ class Parameter(object): return self._value @value.setter - def value(self, v: np.ndarray): - assert isinstance(v, np.ndarray) + def value(self, v: Union[np.ndarray, torch.Tensor]): + v = self._regularize_value(v) assert v.shape == self._value.shape, \ - ('The value updated is not the same shape as the original. ', \ - f'Updated: {v.shape}, original: {self._value.shape}') + f'The value updated is not the same shape as the original. ' \ + f'Updated: {v.shape}, original: {self._value.shape}' self._value = v def _get_weights(self) -> trt.Weights: return self._value.producer.weights if isinstance(self._value, Tensor) else None + + def _regularize_value(self, value): + if isinstance(value, np.ndarray): + return value + elif isinstance(value, torch.Tensor): + return torch_to_numpy(value) + raise TypeError( + f'Expected numpy.ndarray or torch.Tensor, got {type(value)}') diff --git a/tensorrt_llm/profiler.py b/tensorrt_llm/profiler.py index 36803b7a43..25c296f425 100644 --- a/tensorrt_llm/profiler.py +++ b/tensorrt_llm/profiler.py @@ -13,6 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
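# Illustrative sketch (not part of the diff): with the _regularize_value() helper added to
# Parameter above, weights can now be assigned as torch tensors as well as numpy arrays
# and are converted to numpy internally. The shapes below are arbitrary example values.
import numpy as np
import torch
from tensorrt_llm.parameter import Parameter

p = Parameter(value=np.zeros((16, 16), dtype=np.float32), dtype='float32')
p.value = torch.ones(16, 16)  # accepted: converted via torch_to_numpy()
# A tensor with a different shape still fails the shape assertion, and any other type
# now raises a TypeError from _regularize_value().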
import time
+from functools import partial
+from typing import Literal, Optional, Tuple, Union
+
+try:
+    import psutil
+except ImportError:
+    psutil = None
+try:
+    import pynvml
+except ImportError:
+    pynvml = None
+import torch
+
+from tensorrt_llm.logger import logger


 class Timer:
@@ -41,9 +55,9 @@ class Timer:
         self._total_elapsed_times.clear()

     def summary(self):
-        print('Profile Results')
+        logger.info('Profile Results')
         for tag, elapsed_time in self._total_elapsed_times.items():
-            print(f' - {tag.ljust(30, ".")}: {elapsed_time:.6f} (sec)')
+            logger.info(f' - {tag.ljust(30, ".")}: {elapsed_time:.6f} (sec)')


 _default_timer = Timer()
@@ -67,3 +81,189 @@ def reset():

 def summary():
     _default_timer.summary()
+
+
+_pynvml_initialized = False
+
+
+def initialize_pynvml():
+    global _pynvml_initialized
+    if pynvml is not None and not _pynvml_initialized:
+        pynvml.nvmlInit()
+        _pynvml_initialized = True
+
+
+def finalize_pynvml():
+    global _pynvml_initialized
+    if pynvml is not None and _pynvml_initialized:
+        pynvml.nvmlShutdown()
+        _pynvml_initialized = False
+
+
+class MemoryMonitor:
+
+    TAG = '[MemUsage]'
+    UnitType = Literal['GiB', 'MiB', 'KiB']
+    units = {'GiB': 1 << 30, 'MiB': 1 << 20, 'KiB': 1 << 10}
+    # For convenience.
+    _rename_map = {'GB': 'GiB', 'MB': 'MiB', 'KB': 'KiB'}
+
+    _maybe_warned = False
+
+    def __init__(self):
+        # bytes
+        self._peak_host_memory = 0
+        self._peak_device_memory = 0
+        self._check_required_packages()
+
+        self.device_handles = {}
+        initialize_pynvml()
+
+        if pynvml.__version__ < '11.5.0':
+            logger.warning(f'Found pynvml=={pynvml.__version__}. Please use '
+                           f'pynvml>=11.5.0 to get accurate memory usage')
+            # Support legacy pynvml. Note that an old API could return
+            # wrong GPU memory usage.
+            self._device_mem__fn = pynvml.nvmlDeviceGetMemoryInfo
+        else:
+            self._device_mem__fn = partial(pynvml.nvmlDeviceGetMemoryInfo,
+                                           version=pynvml.nvmlMemory_v2)
+
+    @classmethod
+    def _check_required_packages(cls):
+        if cls._maybe_warned:
+            return
+        if psutil is None:
+            # Warn once.
+            logger.warning(
+                "A required package 'psutil' is not installed. Will not "
+                "monitor the host memory usage. Please install the package "
+                "first, e.g., 'pip install psutil'.")
+        if pynvml is None:
+            # Warn once.
+            logger.warning(
+                "A required package 'pynvml' is not installed. Will not "
+                "monitor the device memory usage. Please install the package "
+                "first, e.g., 'pip install pynvml>=11.5.0'.")
+        cls._maybe_warned = True
+
+    def host_memory_info(self) -> Tuple[int, int, int]:
+        process = psutil.Process()
+        # USS reports the amount of memory that would be freed if the process
+        # was terminated right now.
+        # https://psutil.readthedocs.io/en/latest/index.html#psutil.Process.memory_full_info
+        vmem = psutil.virtual_memory()
+        total_mem = vmem.total
+        free_mem = vmem.available
+        alloc_mem = process.memory_full_info().uss
+        if alloc_mem > self._peak_host_memory:
+            self._peak_host_memory = alloc_mem
+        return alloc_mem, free_mem, total_mem
+
+    def device_memory_info(
+        self,
+        device: Optional[Union[torch.device, int]] = None,
+    ) -> Tuple[int, int, int]:
+        index = torch._utils._get_device_index(device, optional=True)
+        if index not in self.device_handles:
+            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
+            self.device_handles[index] = handle
+        mem_info = self._device_mem__fn(self.device_handles[index])
+        if mem_info.used > self._peak_device_memory:
+            self._peak_device_memory = mem_info.used
+        return mem_info.used, mem_info.free, mem_info.total
+
+    @staticmethod
+    def _normalize_unit_name(unit: str):
+        # Rename GB -> GiB.
+        return {'GB': 'GiB', 'MB': 'MiB', 'KB': 'KiB'}[unit]
+
+    @classmethod
+    def _format(cls, mem_bytes: int, unit: UnitType) -> str:
+        if unit not in cls.units:
+            unit = cls._rename_map[unit]
+        mem_usage = float(mem_bytes) / cls.units[unit]
+        return f'{mem_usage:.4f} ({unit})'
+
+    @classmethod
+    def _print_message(cls, msg: str, tag: Optional[str] = None):
+        if tag:
+            msg = f'{tag} - {msg}'
+        logger.info(f'{cls.TAG} {msg}')
+
+    def print_host_memory_usage(self,
+                                tag: Optional[str] = None,
+                                unit: UnitType = 'GiB'):
+        if psutil is None:
+            return
+        alloc_mem, _, _ = self.host_memory_info()
+        msg = f'Allocated Host Memory {self._format(alloc_mem, unit)}'
+        self._print_message(msg, tag)
+
+    def print_device_memory_usage(
+        self,
+        tag: Optional[str] = None,
+        unit: UnitType = 'GiB',
+        device: Optional[Union[torch.device, int]] = None,
+    ):
+        alloc_mem, _, _ = self.device_memory_info(device)
+        msg = f'Allocated Device Memory {self._format(alloc_mem, unit)}'
+        self._print_message(msg, tag)
+
+    def print_memory_usage(
+        self,
+        tag: Optional[str] = None,
+        unit: UnitType = 'GiB',
+        device: Optional[Union[torch.device, int]] = None,
+    ):
+        alloc_host_mem, _, _ = self.host_memory_info()
+        alloc_device_mem, _, _ = self.device_memory_info(device=device)
+        msg = f'Allocated Memory: Host {self._format(alloc_host_mem, unit)} '\
+              f'Device {self._format(alloc_device_mem, unit)}'
+        self._print_message(msg, tag)
+
+    def print_peak_memory_usage(self, unit: UnitType = 'GiB'):
+        self._print_message(
+            f'Peak Memory Usage: '
+            f'Host {self._format(self._peak_host_memory, unit)} '
+            f'Device {self._format(self._peak_device_memory, unit)}')
+
+
+if psutil is not None and pynvml is not None:
+    _default_memory_monitor = MemoryMonitor()
+else:
+    _default_memory_monitor = None
+
+
+def host_memory_info():
+    if _default_memory_monitor is not None:
+        return _default_memory_monitor.host_memory_info()
+
+
+def device_memory_info(device: Optional[Union[torch.device, int]] = None):
+    if _default_memory_monitor is not None:
+        return _default_memory_monitor.device_memory_info(device)
+
+
+def print_host_memory_usage(tag: Optional[str] = None,
+                            unit: MemoryMonitor.UnitType = 'GiB'):
+    if _default_memory_monitor is not None:
+        _default_memory_monitor.print_host_memory_usage(tag=tag, unit=unit)
+
+
+def print_device_memory_usage(tag: Optional[str] = None,
+                              unit: MemoryMonitor.UnitType = 'GiB'):
+    if _default_memory_monitor is not None:
+        _default_memory_monitor.print_device_memory_usage(tag=tag, unit=unit)
+
+
+def print_memory_usage(tag: Optional[str] = None,
+                       unit: MemoryMonitor.UnitType = 'GiB'):
+    if _default_memory_monitor
is not None: + _default_memory_monitor.print_memory_usage(tag=tag, unit=unit) + + +def print_peak_memory_usage(unit: MemoryMonitor.UnitType = 'GiB'): + if _default_memory_monitor is not None: + _default_memory_monitor.print_peak_memory_usage(unit=unit) diff --git a/tensorrt_llm/runtime/__init__.py b/tensorrt_llm/runtime/__init__.py index 703f1f9456..d1075424d7 100644 --- a/tensorrt_llm/runtime/__init__.py +++ b/tensorrt_llm/runtime/__init__.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .generation import (ChatGLM6BHeadModelGenerationSession, GenerationSession, +from .generation import (ChatGLMGenerationSession, GenerationSession, ModelConfig, SamplingConfig, to_word_list_format) from .kv_cache_manager import GenerationSequence, KVCacheManager from .session import Session, TensorInfo @@ -25,6 +25,6 @@ __all__ = [ 'SamplingConfig', 'Session', 'TensorInfo', - 'ChatGLM6BHeadModelGenerationSession', + 'ChatGLMGenerationSession', 'to_word_list_format', ] diff --git a/tensorrt_llm/runtime/generation.py b/tensorrt_llm/runtime/generation.py index 687bbb2851..6606afdc6e 100755 --- a/tensorrt_llm/runtime/generation.py +++ b/tensorrt_llm/runtime/generation.py @@ -249,7 +249,7 @@ class ModelConfig: has_position_embedding: bool = True has_token_type_embedding: bool = False tokens_per_block: int = 64 - use_prompt_tuning: bool = False + max_prompt_embedding_table_size: int = 0 quant_mode: QuantMode = QuantMode(0) gather_all_token_logits: bool = False dtype: str = "" @@ -402,7 +402,7 @@ class GenerationSession(object): 'attention_mask', ] - if model_config.use_prompt_tuning: + if model_config.max_prompt_embedding_table_size > 0: expected_tensor_names += [ 'prompt_embedding_table', 'tasks', 'prompt_vocab_size' ] @@ -1656,6 +1656,7 @@ class GenerationSession(object): next_step_buffer = None attention_mask = None context_logits = None + generation_logits = [] def get_outputs_dict(output_ids): outputs = {} @@ -1666,6 +1667,7 @@ class GenerationSession(object): [batch_size, beam_width]) if self.gather_all_token_logits: outputs['context_logits'] = context_logits + outputs['generation_logits'] = generation_logits return outputs for step in range(0, self.max_new_tokens): @@ -1680,6 +1682,10 @@ class GenerationSession(object): encoder_input_lengths) if step == 0: context_logits = logits + if self.gather_all_token_logits: + generation_logits.append( + next_step_buffer['logits'].clone().detach()) + if should_stop is not None and should_stop.item(): final_output_ids = self.finalize_decoder( context_lengths, batch_size, beam_width, scfg) @@ -1783,12 +1789,14 @@ class GenerationSession(object): def decode_batch(self, input_ids: Sequence[torch.Tensor], sampling_config: SamplingConfig, - streaming: bool = False): + streaming: bool = False, + **kwargs): input_ids, context_lengths = _prepare_input_ids(input_ids) return self.decode(input_ids, context_lengths, sampling_config, - streaming=streaming) + streaming=streaming, + **kwargs) # As dynamic_decoder uses torch's current stream, we must ensure it runs on the same stream that # dynamic_decoder was set up with @@ -1907,24 +1915,42 @@ class GenerationSession(object): encoder_output, encoder_input_lengths) -class ChatGLM6BHeadModelGenerationSession(GenerationSession): +class ChatGLMGenerationSession(GenerationSession): def _prepare_context_inputs(self, batch_size, context_lengths, use_gpt_attention_plugin, 
remove_input_padding, **kwargs): - assert not remove_input_padding last_token_ids = context_lengths.detach().clone() max_context_length = kwargs.pop('max_context_length') - position_ids = torch.zeros([batch_size, 2, max_context_length], - dtype=torch.int32) - position_ids[:, 0, :] = torch.arange(max_context_length) - for i in range(batch_size): - length = context_lengths[i] - position_ids[i, 0, length - 1] = length - 2 - position_ids[i, 1, length - 1] = 1 - position_ids[i, :, length:] = 0 - position_ids = position_ids.cuda() + + if remove_input_padding: + input_lengths_acc = torch.cumsum(torch.cat( + [torch.IntTensor([0]).cuda(), context_lengths], dim=0), + dim=0) + position_ids = torch.zeros([1, 2, input_lengths_acc[-1]], + dtype=torch.int32) + for i in range(batch_size): + position_ids[0, 0, input_lengths_acc[i]:input_lengths_acc[ + i + 1]] = torch.arange(0, + context_lengths[i], + dtype=torch.int32) + position_ids[0, 0, input_lengths_acc[i + 1] - + 1] = context_lengths[i] - 2 + position_ids[0, 1, input_lengths_acc[i + 1] - 1] = 1 + position_ids = position_ids.int().cuda() + last_token_ids = torch.cumsum(last_token_ids, dim=0).int().cuda() + else: + position_ids = torch.zeros([batch_size, 2, max_context_length], + dtype=torch.int32) + position_ids[:, 0, :] = torch.arange(max_context_length) + for i in range(batch_size): + length = context_lengths[i] + position_ids[i, 0, length - 1] = length - 2 + position_ids[i, 1, length - 1] = 1 + position_ids[i, :, length:] = 0 + position_ids = position_ids.cuda() + inputs = { 'position_ids': position_ids, 'last_token_ids': last_token_ids @@ -1937,17 +1963,25 @@ class ChatGLM6BHeadModelGenerationSession(GenerationSession): def _prepare_generation_inputs(self, batch_size, context_lengths, use_gpt_attention_plugin, remove_input_padding, **kwargs): - assert not remove_input_padding - last_token_ids = torch.ones_like(context_lengths) step = kwargs.pop('step') num_beams = kwargs.pop('num_beams') + last_token_ids = torch.ones_like(context_lengths) - data = [] - for i in range(batch_size): - data.append([[context_lengths[i * num_beams] - 2], [step + 2]]) - position_ids = torch.tensor(data, dtype=torch.int32, device='cuda') - position_ids = _tile_beam_width(position_ids, num_beams) + if remove_input_padding: + position_ids = torch.zeros([1, 2, batch_size], dtype=torch.int32) + for i in range(batch_size): + position_ids[0, 0, i] = context_lengths[i * num_beams] - 2 + position_ids[0, 1, i] = step + 2 + position_ids = _tile_beam_width(position_ids, num_beams) + position_ids = position_ids.int().cuda() + last_token_ids = torch.cumsum(last_token_ids, dim=0).int().cuda() + else: + data = [] + for i in range(batch_size): + data.append([[context_lengths[i * num_beams] - 2], [step + 2]]) + position_ids = torch.tensor(data, dtype=torch.int32, device='cuda') + position_ids = _tile_beam_width(position_ids, num_beams) inputs = { 'position_ids': position_ids, diff --git a/tensorrt_llm/tools/__init__.py b/tensorrt_llm/tools/__init__.py index e69de29bb2..96a0c34d85 100644 --- a/tensorrt_llm/tools/__init__.py +++ b/tensorrt_llm/tools/__init__.py @@ -0,0 +1,16 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
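The `ChatGLMGenerationSession` update above builds two-channel position ids for both the padded and the packed (`remove_input_padding`) paths. A standalone illustration of the padded-mode layout it produces, using the same logic with concrete context lengths (CPU-only for readability):

```python
import torch

# Two requests with context lengths 5 and 3, padded to max_context_length=5.
# Channel 0 carries token positions, channel 1 marks the generation block.
batch_size, max_context_length = 2, 5
context_lengths = torch.tensor([5, 3], dtype=torch.int32)

position_ids = torch.zeros([batch_size, 2, max_context_length],
                           dtype=torch.int32)
position_ids[:, 0, :] = torch.arange(max_context_length)
for i in range(batch_size):
    length = context_lengths[i]
    position_ids[i, 0, length - 1] = length - 2  # last token reuses position length-2
    position_ids[i, 1, length - 1] = 1           # ...and opens the generation block
    position_ids[i, :, length:] = 0              # padded tail stays zero

print(position_ids[1])
# tensor([[0, 1, 1, 0, 0],
#         [0, 0, 1, 0, 0]], dtype=torch.int32)
```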
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .gc_helper import cleanup # noqa diff --git a/tensorrt_llm/tools/gc_helper.py b/tensorrt_llm/tools/gc_helper.py new file mode 100644 index 0000000000..843b0d8d15 --- /dev/null +++ b/tensorrt_llm/tools/gc_helper.py @@ -0,0 +1,12 @@ +from ..module import Module +from ..network import Network + + +def cleanup(network: Network, model: Module): + # TODO: A quick fix for the memory leak caused by Parameter. + # Remove this method once the issue fixed in a proper way. + for _, param in model.named_parameters(): + # param._value captures the numpy array so that gc can't collect + # those buffers. + param._value = None + network._registered_ndarrays = None diff --git a/tensorrt_llm/tools/plugin_gen/core.py b/tensorrt_llm/tools/plugin_gen/core.py index 910e17f97d..034ba6431a 100644 --- a/tensorrt_llm/tools/plugin_gen/core.py +++ b/tensorrt_llm/tools/plugin_gen/core.py @@ -269,7 +269,7 @@ class KernelMetaData: if yaml_path: with open(yaml_path, "r") as f: yaml_str = f.read() - yaml_data = yaml.load(yaml_str, Loader=yaml.Loader) + yaml_data = yaml.load(yaml_str, Loader=yaml.SafeLoader) kernel_name = yaml_data["name"] ios = [] @@ -682,13 +682,12 @@ class PluginCmakeCodegen: def setup_jinja_env() -> jinja2.Environment: - env = jinja2.Environment( - loader=jinja2.PackageLoader( - package_name="tensorrt_llm.tools.plugin_gen", - package_path="templates", - ), - undefined=jinja2.StrictUndefined, - ) + env = jinja2.Environment(loader=jinja2.PackageLoader( + package_name="tensorrt_llm.tools.plugin_gen", + package_path="templates", + ), + undefined=jinja2.StrictUndefined, + autoescape=jinja2.select_autoescape()) env.variable_start_string = '[[' env.variable_end_string = ']]' return env diff --git a/tensorrt_llm/tools/plugin_gen/plugin_gen.py b/tensorrt_llm/tools/plugin_gen/plugin_gen.py index 2095c6cd8b..83aaea539e 100644 --- a/tensorrt_llm/tools/plugin_gen/plugin_gen.py +++ b/tensorrt_llm/tools/plugin_gen/plugin_gen.py @@ -296,14 +296,14 @@ def _mkdir(path: str): ''' mkdir if not exists ''' - subprocess.run(['mkdir', '-p', path], check=True) + subprocess.run(['/usr/bin/mkdir', '-p', path], check=True) def _rmdir(path: str): ''' rmdir if exists ''' - subprocess.run(['rm', '-rf', path], check=True) + subprocess.run(['/usr/bin/rm', '-rf', path], check=True) def _run_command(args, cwd=None): diff --git a/tensorrt_llm/tools/ppl.py b/tensorrt_llm/tools/ppl.py new file mode 100644 index 0000000000..4fd4d67753 --- /dev/null +++ b/tensorrt_llm/tools/ppl.py @@ -0,0 +1,7 @@ +def ppl(logits, output_ids): + """ + Calculate per-token perplexity. 
+ """ + nlls = -logits.log_softmax(dim=-1) + ppls = nlls.gather(-1, output_ids.long().unsqueeze(-1)) + return ppls.mean().exp().item() diff --git a/tests/attention/test_gpt_attention.py b/tests/attention/test_gpt_attention.py index e09c28ebc7..a706e8ee0b 100644 --- a/tests/attention/test_gpt_attention.py +++ b/tests/attention/test_gpt_attention.py @@ -427,11 +427,16 @@ class TestFunctional(unittest.TestCase): stream = torch.cuda.current_stream() # NOTE: when 8-bit kv cache is used together with paged kv cache no 8-bit tensors are exposed to TRT int8_trt_flag = use_int8_kv_cache and not paged_kv_cache - fp8_trt_flag = use_fp8_kv_cache and not paged_kv_cache - builder_config = builder.create_builder_config(name=attention_type, - precision=dtype, - int8=int8_trt_flag, - fp8=fp8_trt_flag) + use_fp8_kv_cache and not paged_kv_cache + quant_mode = QuantMode.from_description( + use_fp8_kv_cache=use_fp8_kv_cache + ) if use_fp8_kv_cache and not paged_kv_cache else QuantMode(0) + builder_config = builder.create_builder_config( + name=attention_type, + precision=dtype, + int8=int8_trt_flag, + quant_mode=quant_mode) + if session is None: engine = builder.build_engine(net, builder_config) session = tensorrt_llm.runtime.Session.from_serialized_engine( diff --git a/tests/attention/test_gpt_attention_IFB.py b/tests/attention/test_gpt_attention_IFB.py index 1726c05992..55c3245722 100644 --- a/tests/attention/test_gpt_attention_IFB.py +++ b/tests/attention/test_gpt_attention_IFB.py @@ -71,53 +71,59 @@ class TestFunctional(unittest.TestCase): test_cases = [] test_cases += list( product(['gpt2_attention', 'llama_attention', 'gptj_attention'], - [ContextFMHAType.disabled], ['float16'], [2], [128], [4], - [64], [0], [False], [False], [1], [True, False])) + [ContextFMHAType.disabled], ['float16'], [2], [128], [8], + [4], [64], [0], [False], [False], [1], [True, False])) # TODO: add more unit tests test_cases += list( product(['llama_attention'], [ ContextFMHAType.disabled, ContextFMHAType.enabled, ContextFMHAType.enabled_with_fp32_acc - ], ['float16'], [2], [90], [4], [32], [0], [False], [False], [1], - [False])) + ], ['float16'], [2], [90], [8], [4], [32], [0], [False], [False], + [1], [False])) # Test cases for the multi-block MMHA. test_cases += list( product(['llama_attention'], [ ContextFMHAType.enabled, ContextFMHAType.enabled_with_fp32_acc - ], ['float16', 'float32'], [2], [2048], [4], [64], [0], [True], + ], ['float16', 'float32'], [2], [2048], [8], [4], [64], [0], [True], [False], [1], [True, False])) + test_cases += list( + product(['llama_attention'], + [ContextFMHAType.enabled_with_fp32_acc], ['float16'], [16], + [2048], [32], [4], [64], [0], [True], [False], [1], + [False])) # Test cases for the int8 K/V cache. 
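For the `ppl` helper added in `tensorrt_llm/tools/ppl.py` above, a quick usage sketch on dummy data (shapes assumed to be `[batch, seq, vocab]` logits and `[batch, seq]` generated token ids):

```python
import torch

from tensorrt_llm.tools.ppl import ppl

batch, seq, vocab = 2, 4, 32
logits = torch.randn(batch, seq, vocab)
output_ids = torch.randint(0, vocab, (batch, seq))

# exp of the mean negative log-likelihood of the generated tokens
print(f'perplexity: {ppl(logits, output_ids):.3f}')

# Sanity check: uniform logits give a perplexity equal to the vocab size.
uniform = torch.zeros(batch, seq, vocab)
assert abs(ppl(uniform, output_ids) - vocab) < 1e-3
```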
test_cases += list( product(['gpt2_attention'], [ContextFMHAType.disabled], - ['float16', 'float32'], [2], [128], [4], [64], [0], [False], - [True], [1], [False])) + ['float16', 'float32'], [2], [128], [8], [4], [64], [0], + [False], [True], [1], [False])) # test cases for multi-query attention test_cases += list( product(['gpt_bigcode_attention'], [ ContextFMHAType.disabled, ContextFMHAType.enabled, ContextFMHAType.enabled_with_fp32_acc - ], ['float16'], [2], [128], [4], [64], [1], [False], [False], [1], - [False])) + ], ['float16'], [2], [128], [8], [4], [64], [1], [False], [False], + [1], [False])) # test cases for beam search test_cases += list( product(['gpt2_attention'], [ContextFMHAType.disabled], ['float16'], - [2], [128], [4], [64], [0], [False], [False], [4], [False])) + [2], [128], [8], [4], [64], [0], [False], [False], [4], + [False])) # test cases for grouped-query attention test_cases += list( product(['llama_attention'], [ContextFMHAType.disabled], - ['float16'], [2], [128], [8], [32], [2, 4], [False], + ['float16'], [2], [128], [8], [8], [32], [2, 4], [False], [False], [1], [False])) # test cases for rotary scaling test_cases += list( product(['llama_attention'], [ContextFMHAType.disabled], - ['float32'], [2], [128], [8], [32], [2, 8], [False], + ['float32'], [2], [128], [8], [8], [32], [2, 8], [False], [False], [1], [False], [10000.0, 1000000.0], [ { "type": "linear", @@ -143,6 +149,7 @@ class TestFunctional(unittest.TestCase): dtype, batch_size, in_len, + out_len, num_heads, head_size, num_kv_heads, @@ -376,9 +383,9 @@ class TestFunctional(unittest.TestCase): plugin_kv_num_heads = num_kv_heads if attention_type == 'llama_attention' or attention_type == 'gpt_bigcode_attention' else num_heads kv_hidden_size = plugin_kv_num_heads * head_size qkv_hidden_size = hidden_size + 2 * kv_hidden_size - out_len = 8 - max_seq_len = in_len + 24 + max_seq_len = in_len + out_len * 3 num_req = batch_size + in_lens = torch.randint(1, in_len + 1, (num_req, )) max_blocks_per_seq = math.ceil(max_seq_len / tokens_per_block) blocks = math.ceil( (num_req * beam_width * max_seq_len) / tokens_per_block) @@ -413,8 +420,10 @@ class TestFunctional(unittest.TestCase): cache_indirection = torch.zeros(shape_dict['cache_indirection'], dtype=torch.int32, device='cuda') - for iteration in range(1, beam_width): - cache_indirection[:, iteration, in_len:] = iteration + for req_idx in range(num_req): + in_len_req = in_lens[req_idx] + for iteration in range(1, beam_width): + cache_indirection[req_idx, iteration, in_len_req:] = iteration kv_int8_dequant_scale = torch.randint( 1, @@ -612,11 +621,13 @@ class TestFunctional(unittest.TestCase): def torch_exec(step: int, input: torch.Tensor, ctx_attention_mask: torch.Tensor, + req_idx: int, layer_past=None): assert layer_past != None or input.shape[0] == 1 nonlocal attention nonlocal attention_type - nonlocal in_len + nonlocal in_lens + in_len = in_lens[req_idx] position_ids = ctx_attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(ctx_attention_mask == 0, 1) if step != 0: @@ -700,35 +711,38 @@ class TestFunctional(unittest.TestCase): batch_req_ids = [] for req_idx in reversed(range(num_req)): step = get_step(req_idx) + in_len_req = in_lens[req_idx] if is_valid_step(step): batch_req_ids.append(req_idx) if step == 0: - input_length_list.append([in_len]) - context_length_list += [in_len] + input_length_list.append([in_len_req]) + context_length_list += [in_len_req] request_type_list += [0] host_past_key_value_length_list += [0] sequence_selection += [req_idx 
* beam_width] num_context_req += 1 else: input_length_list.append([1] * beam_width) - context_length_list += [in_len] * beam_width + context_length_list += [in_len_req] * beam_width request_type_list += [1] * beam_width - host_past_key_value_length_list += [in_len + step - 1 - ] * beam_width + host_past_key_value_length_list += [ + in_len_req + step - 1 + ] * beam_width num_generation_req += 1 sequence_selection += list( range(req_idx * beam_width, (req_idx + 1) * beam_width)) - sequence_length_list += [in_len + step] * beam_width + sequence_length_list += [in_len_req + step] * beam_width num_seq = num_context_req + num_generation_req * beam_width # Check if new sequence arrived if iteration < num_req: + in_len_req = in_lens[iteration] # Add sequence to the manager sequence = GenerationSequence(seq_idx=iteration, batch_idx=iteration) - kv_cache_manager.add_sequence(sequence, in_len) + kv_cache_manager.add_sequence(sequence, in_len_req.clone()) # Get arrays of pointers to the "pages" of KV values pointer_arrays = kv_cache_manager.get_pointer_arrays(beam_width)[0] @@ -796,13 +810,14 @@ class TestFunctional(unittest.TestCase): for req_idx in batch_req_ids: step = get_step(req_idx) assert is_valid_step(step) + in_len_req = in_lens[req_idx] if step == 0: ctx_attention_mask_list[req_idx] = torch.ones( - (1, in_len), dtype=torch.int32, device='cuda') + (1, in_len_req), dtype=torch.int32, device='cuda') else: if step == 1: ctx_attention_mask_list[req_idx] = torch.ones( - (beam_width, in_len), + (beam_width, in_len_req), dtype=torch.int32, device='cuda') ctx_attention_mask_list[req_idx] = torch.cat( @@ -831,7 +846,7 @@ class TestFunctional(unittest.TestCase): torch_in = input_tensor[:, offset:offset_next, :].reshape( (local_beam_width, input_length, hidden_size)) torch_out, torch_cache_list[req_idx] = torch_exec( - step, torch_in, ctx_attention_mask_list[req_idx], + step, torch_in, ctx_attention_mask_list[req_idx], req_idx, torch_cache_list[req_idx]) np.testing.assert_allclose( diff --git a/tests/bindings/test_bindings.py b/tests/bindings/test_bindings.py new file mode 100644 index 0000000000..1d0ffcaba2 --- /dev/null +++ b/tests/bindings/test_bindings.py @@ -0,0 +1,343 @@ +import json +import tempfile +from pathlib import Path + +import torch + +import tensorrt_llm.bindings as _tb + + +def test_generation_output(): + ids = torch.ones(1) + lengths = torch.ones(2) + gen_output = _tb.GenerationOutput(ids, lengths) + assert torch.equal(gen_output.ids, ids) + assert torch.equal(gen_output.lengths, lengths) + + assert gen_output.log_probs is None + log_probs = torch.ones(1) + gen_output.log_probs = log_probs + assert gen_output.log_probs == log_probs + + assert gen_output.context_logits is None + torch.ones(1) + gen_output.context_logits = log_probs + assert gen_output.context_logits == log_probs + + +def test_generation_input(): + end_id = 42 + pad_id = 13 + ids = torch.ones(1) + lengths = torch.ones(2) + packed = True + gen_input = _tb.GenerationInput(end_id, pad_id, ids, lengths, packed) + assert gen_input.end_id == end_id + assert gen_input.pad_id == pad_id + assert torch.equal(gen_input.ids, ids) + assert torch.equal(gen_input.lengths, lengths) + assert gen_input.packed == packed + + assert gen_input.max_new_tokens is None + max_new_tokens = 100 + gen_input.max_new_tokens = max_new_tokens + assert gen_input.max_new_tokens == max_new_tokens + + assert gen_input.embedding_bias is None + embedding_bias = torch.ones(3) + gen_input.embedding_bias = embedding_bias + assert 
torch.equal(gen_input.embedding_bias, embedding_bias) + + assert gen_input.prompt_tuning_params.embedding_table is None + assert gen_input.prompt_tuning_params.tasks is None + assert gen_input.prompt_tuning_params.vocab_size is None + + embedding_table = torch.ones(3) + tasks = torch.ones(2) + vocab_size = torch.ones(1) + prompt_tuning_params = _tb.PromptTuningParams( + embedding_table=embedding_table, tasks=tasks, vocab_size=vocab_size) + assert len(prompt_tuning_params.prompt_tuning_enabled) == 0 + prompt_tuning_enabled = [True, False] + prompt_tuning_params.prompt_tuning_enabled = prompt_tuning_enabled + assert len(prompt_tuning_params.prompt_tuning_enabled) == 2 + assert prompt_tuning_params.prompt_tuning_enabled == prompt_tuning_enabled + gen_input.prompt_tuning_params = prompt_tuning_params + assert gen_input.prompt_tuning_params is not None + assert torch.equal(gen_input.prompt_tuning_params.embedding_table, + embedding_table) + assert torch.equal(gen_input.prompt_tuning_params.tasks, tasks) + assert torch.equal(gen_input.prompt_tuning_params.vocab_size, vocab_size) + assert gen_input.prompt_tuning_params.prompt_tuning_enabled == prompt_tuning_enabled + + +def test_gpt_session_config(): + kv_cache_config = _tb.KvCacheConfig() + assert kv_cache_config.max_tokens is None + max_tokens = 13 + kv_cache_config.max_tokens = max_tokens + assert kv_cache_config.max_tokens == max_tokens + assert kv_cache_config.free_gpu_memory_fraction is None + free_gpu_memory_fraction = 0.5 + kv_cache_config.free_gpu_memory_fraction = free_gpu_memory_fraction + assert kv_cache_config.free_gpu_memory_fraction == free_gpu_memory_fraction + + max_batch_size = 1000 + max_beam_width = 64 + max_sequence_length = 1 << 20 + gpt_session_config = _tb.GptSessionConfig(max_batch_size, max_beam_width, + max_sequence_length) + assert gpt_session_config.max_batch_size == max_batch_size + assert gpt_session_config.max_beam_width == max_beam_width + assert gpt_session_config.max_sequence_length == max_sequence_length + + assert gpt_session_config.kv_cache_config is not None + assert gpt_session_config.kv_cache_config.max_tokens is None + assert gpt_session_config.kv_cache_config.free_gpu_memory_fraction is None + gpt_session_config.kv_cache_config = kv_cache_config + assert gpt_session_config.kv_cache_config.max_tokens == max_tokens + assert gpt_session_config.kv_cache_config.free_gpu_memory_fraction == free_gpu_memory_fraction + gpt_session_config.kv_cache_config.max_tokens = None + assert gpt_session_config.kv_cache_config.max_tokens is None + gpt_session_config.kv_cache_config.free_gpu_memory_fraction = None + assert gpt_session_config.kv_cache_config.free_gpu_memory_fraction is None + + assert not gpt_session_config.decoder_per_request + gpt_session_config.decoder_per_request = True + assert gpt_session_config.decoder_per_request + + assert not gpt_session_config.cuda_graph_mode + gpt_session_config.cuda_graph_mode = True + assert gpt_session_config.cuda_graph_mode + + assert gpt_session_config.ctx_micro_batch_size is None + ctx_micro_batch_size = 10 + gpt_session_config.ctx_micro_batch_size = ctx_micro_batch_size + assert gpt_session_config.ctx_micro_batch_size == ctx_micro_batch_size + + assert gpt_session_config.gen_micro_batch_size is None + gen_micro_batch_size = 20 + gpt_session_config.gen_micro_batch_size = gen_micro_batch_size + assert gpt_session_config.gen_micro_batch_size == gen_micro_batch_size + + +def test_quant_mode(): + assert _tb.QuantMode.none().value == 0 + assert 
_tb.QuantMode.int4_weights().has_int4_weights + assert _tb.QuantMode.int8_weights().has_int8_weights + assert _tb.QuantMode.activations().has_activations + assert _tb.QuantMode.per_channel_scaling().has_per_channel_scaling + assert _tb.QuantMode.per_token_scaling().has_per_token_scaling + assert _tb.QuantMode.per_group_scaling().has_per_group_scaling + assert _tb.QuantMode.int8_kv_cache().has_int8_kv_cache + assert _tb.QuantMode.fp8_kv_cache().has_fp8_kv_cache + assert _tb.QuantMode.fp8_qdq().has_fp8_qdq + + quant_mode = _tb.QuantMode.from_description(True, True, True, True, True, + True, True, True) + assert quant_mode.has_int4_weights + quant_mode -= _tb.QuantMode.int4_weights() + assert not quant_mode.has_int4_weights + quant_mode += _tb.QuantMode.int4_weights() + assert quant_mode.has_int4_weights + + assert _tb.QuantMode.none() == _tb.QuantMode.none() + + +def test_gpt_model_config(): + vocab_size = 10000 + num_layers = 12 + num_heads = 16 + hidden_size = 768 + data_type = _tb.DataType.FLOAT + gpt_model_config = _tb.GptModelConfig(vocab_size, num_layers, num_heads, + hidden_size, data_type) + assert gpt_model_config.vocab_size == vocab_size + assert gpt_model_config.num_layers() == num_layers + assert gpt_model_config.num_heads == num_heads + assert gpt_model_config.hidden_size == hidden_size + assert gpt_model_config.data_type == data_type + + assert gpt_model_config.vocab_size_padded(1) is not None + assert gpt_model_config.size_per_head == hidden_size // num_heads + + assert gpt_model_config.num_kv_heads == num_heads + num_kv_heads = 1 + gpt_model_config.num_kv_heads = num_kv_heads + assert gpt_model_config.num_kv_heads == num_kv_heads + + assert not gpt_model_config.use_gpt_attention_plugin + gpt_model_config.use_gpt_attention_plugin = True + assert gpt_model_config.use_gpt_attention_plugin + + assert not gpt_model_config.use_packed_input + gpt_model_config.use_packed_input = True + assert gpt_model_config.use_packed_input + + assert not gpt_model_config.use_paged_kv_cache + gpt_model_config.use_paged_kv_cache = True + assert gpt_model_config.use_paged_kv_cache + + assert gpt_model_config.tokens_per_block == 64 + tokens_per_block = 1024 + gpt_model_config.tokens_per_block = tokens_per_block + assert gpt_model_config.tokens_per_block == tokens_per_block + + assert gpt_model_config.quant_mode == _tb.QuantMode.none() + gpt_model_config.quant_mode = _tb.QuantMode.int4_weights() + assert gpt_model_config.quant_mode.has_int4_weights + + assert gpt_model_config.supports_inflight_batching + + assert gpt_model_config.max_batch_size == 0 + max_batch_size = 1000 + gpt_model_config.max_batch_size = max_batch_size + assert gpt_model_config.max_batch_size == max_batch_size + + assert gpt_model_config.max_input_len == 0 + max_input_len = 2048 + gpt_model_config.max_input_len = max_input_len + assert gpt_model_config.max_input_len == max_input_len + + assert gpt_model_config.max_num_tokens is None + max_num_tokens = 10000 + gpt_model_config.max_num_tokens = max_num_tokens + assert gpt_model_config.max_num_tokens == max_num_tokens + + assert not gpt_model_config.compute_context_logits + gpt_model_config.compute_context_logits = True + assert gpt_model_config.compute_context_logits + + assert gpt_model_config.model_variant == _tb.GptModelVariant.GPT + model_variant = _tb.GptModelVariant.GLM + gpt_model_config.model_variant = model_variant + assert gpt_model_config.model_variant == model_variant + + assert not gpt_model_config.use_custom_all_reduce + gpt_model_config.use_custom_all_reduce = True + 
assert gpt_model_config.use_custom_all_reduce + + +def test_world_config(): + tensor_parallelism = 2 + pipeline_parallelism = 4 + rank = 3 + gpus_per_node = 10 + world_config = _tb.WorldConfig(tensor_parallelism, pipeline_parallelism, + rank, gpus_per_node) + assert world_config.tensor_parallelism == tensor_parallelism + assert world_config.pipeline_parallelism == pipeline_parallelism + assert world_config.rank == rank + assert world_config.gpus_per_node == gpus_per_node + assert world_config.size == tensor_parallelism * pipeline_parallelism + assert world_config.is_pipeline_parallel + assert world_config.is_tensor_parallel + assert world_config.device == rank % gpus_per_node + assert world_config.pipeline_parallel_rank == rank // tensor_parallelism + assert world_config.tensor_parallel_rank == rank % tensor_parallelism + + world_config = _tb.WorldConfig.mpi(gpus_per_node) + assert world_config.tensor_parallelism == 1 + assert world_config.pipeline_parallelism == 1 + assert world_config.gpus_per_node == gpus_per_node + assert world_config.rank == 0 + + +def test_sampling_config(): + beam_width = 12 + sampling_config = _tb.SamplingConfig(beam_width) + assert sampling_config.beam_width == 12 + + def check_empty_then_set(member, value): + assert getattr(sampling_config, member) is None + setattr(sampling_config, member, value) + assert getattr(sampling_config, member) == value + + float_array = [1., 2., 3.] + size_t_array = [1, 2, 3] + check_empty_then_set("temperature", float_array) + check_empty_then_set("min_length", size_t_array) + check_empty_then_set("repetition_penalty", float_array) + check_empty_then_set("presence_penalty", float_array) + check_empty_then_set("top_k", size_t_array) + check_empty_then_set("top_p", float_array) + check_empty_then_set("random_seed", size_t_array) + check_empty_then_set("top_p_decay", float_array) + check_empty_then_set("top_p_min", float_array) + check_empty_then_set("top_p_reset_ids", size_t_array) + check_empty_then_set("beam_search_diversity_rate", float_array) + check_empty_then_set("length_penalty", float_array) + + +def test_gpt_json_config(): + model_config = { + "vocab_size": 1000, + "num_layers": 12, + "num_heads": 4, + "hidden_size": 512, + "data_type": _tb.DataType.FLOAT, + } + gpt_model_config = _tb.GptModelConfig(**model_config) + json_config = { + "name": "gpt", + "precision": "float32", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + "model_config": gpt_model_config + } + + gpt_json_config = _tb.GptJsonConfig(**json_config) + + def check_properties(the_object, properties, model_config): + for property, value in properties.items(): + if isinstance(value, _tb.GptModelConfig): + object_config = getattr(the_object, property) + for subproperty, subvalue in model_config.items(): + member = getattr(object_config, subproperty) + if callable(member): + member = member() + assert member == subvalue + else: + assert getattr(the_object, property) == value + + check_properties(gpt_json_config, json_config, model_config) + + json_dict = { + "builder_config": { + "name": json_config["name"], + "vocab_size": model_config["vocab_size"], + "num_layers": model_config["num_layers"], + "num_heads": model_config["num_heads"], + "hidden_size": model_config["hidden_size"], + "precision": json_config["precision"], + "tensor_parallel": json_config["tensor_parallelism"], + "pipeline_parallel": json_config["pipeline_parallelism"], + }, + "plugin_config": { + "paged_kv_cache": False, + "tokens_per_block": 0, + "gpt_attention_plugin": False, + 
"remove_input_padding": False, + "use_custom_all_reduce": False, + } + } + + gpt_json_config = _tb.GptJsonConfig.parse(json.dumps(json_dict)) + + with tempfile.NamedTemporaryFile("w", delete=False) as fp: + json.dump(json_dict, fp) + fp.close() + + gpt_json_config = _tb.GptJsonConfig.parse_file(fp.name) + Path(fp.name).unlink() + + rank = 3 + gpus_per_node = 10 + world_config = _tb.WorldConfig(json_config["tensor_parallelism"], + json_config["pipeline_parallelism"], rank, + gpus_per_node) + + assert gpt_json_config.engine_filename( + world_config) == json_config["name"] + "_float32_tp1_rank3.engine" + assert gpt_json_config.engine_filename( + world_config, "llama") == "llama_float32_tp1_rank3.engine" diff --git a/tests/model/test_gptneox.py b/tests/model/test_gptneox.py index f5137acd6e..014c99119d 100644 --- a/tests/model/test_gptneox.py +++ b/tests/model/test_gptneox.py @@ -408,48 +408,6 @@ class TestGPTNeoX(unittest.TestCase): compare_max_abs_error(ref, res, "generation logits") - def test_gptneox_noplugin_unsupported(self): - - use_refit = False - apply_query_key_layer_scaling = False - model = 'gptneox' - - log_level = 'error' - dtype = 'float16' - world_size = 1 - rank = 0 - hidden_act = 'gelu' - n_layer = 1 - max_length = 2 - batch_size = 4 - seq_len = 128 - use_attention_plugin = False - use_ln_gemm_plugin = True - beam_width = 1 - - gpt_config, hf_gpt = self._gen_hf_gpt_neox(hidden_act, n_layer, - seq_len + max_length, dtype) - with self.assertRaisesRegex( - ValueError, - ".*GPT-NeoX RoPE is only supported with GPTAttention plugin.*"): - runtime, _ = self._gen_tensorrt_llm_runtime( - log_level, dtype, world_size, rank, gpt_config, hf_gpt, model, - use_attention_plugin, batch_size, beam_width, seq_len, - max_length, use_refit, use_ln_gemm_plugin, - apply_query_key_layer_scaling) - - use_ln_gemm_plugin = False - if trt.__version__[:3] == '8.6': - with self.assertRaisesRegex( - AssertionError, - "You need to enable the LayerNorm plugin for GPT-NeoX with TensorRT" - ): - runtime, _ = self._gen_tensorrt_llm_runtime( - log_level, dtype, world_size, rank, gpt_config, hf_gpt, - model, use_attention_plugin, batch_size, beam_width, - seq_len, max_length, use_refit, use_ln_gemm_plugin, - apply_query_key_layer_scaling) - if __name__ == '__main__': unittest.main() diff --git a/tests/quantization/test_quant.py b/tests/quantization/test_quant.py index c98ac70109..fd287c21db 100644 --- a/tests/quantization/test_quant.py +++ b/tests/quantization/test_quant.py @@ -16,8 +16,7 @@ import unittest from tensorrt_llm.layers import ColumnLinear, RowLinear from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (GPTLMHeadModel, smooth_quantize, - weight_only_quantize) +from tensorrt_llm.models import GPTLMHeadModel, quantize_model from tensorrt_llm.quantization import QuantMode from tensorrt_llm.quantization.layers import (SmoothQuantAttention, SmoothQuantLayerNorm, @@ -39,7 +38,7 @@ class TestQuant(unittest.TestCase): max_position_embeddings=1024, dtype='float16') - quant_model = weight_only_quantize(model, mode) + quant_model = quantize_model(model, mode) self.assertTrue(hasattr(quant_model, 'quant_mode')) @@ -82,9 +81,9 @@ class TestQuant(unittest.TestCase): max_position_embeddings=1024, dtype='float16') - quant_model = weight_only_quantize(model, - mode, - exclude_modules=['fc', 'dense']) + quant_model = quantize_model(model, + mode, + exclude_modules=['fc', 'dense']) self.assertTrue(hasattr(quant_model, 'quant_mode')) @@ -111,7 +110,7 @@ class TestQuant(unittest.TestCase): 
mapping=Mapping(world_size=1, rank=0, tp_size=1)) quant_mode = QuantMode.use_smooth_quant() - sq_gpt = smooth_quantize(gpt, quant_mode) + sq_gpt = quantize_model(gpt, quant_mode) for layer in sq_gpt.layers: assert isinstance(layer.input_layernorm, SmoothQuantLayerNorm) assert isinstance(layer.post_layernorm, SmoothQuantLayerNorm) diff --git a/tests/tools/plugin_gen/test_plugin_gen.py b/tests/tools/plugin_gen/test_plugin_gen.py index 59886b1e64..7880d2ffcf 100644 --- a/tests/tools/plugin_gen/test_plugin_gen.py +++ b/tests/tools/plugin_gen/test_plugin_gen.py @@ -23,6 +23,13 @@ def is_triton_installed() -> bool: return os.path.exists(TRITON_COMPILE_BIN) -@pytest.mark.skipif(not is_triton_installed(), reason='triton is not installed') +def is_trt_automation() -> bool: + return os.path.exists("/build/config.yml") + + +@pytest.mark.skipif( + not is_triton_installed() or is_trt_automation(), + reason='triton is not installed, this test is not supported in trt automation' +) def test_end_to_end(): gen_trt_plugins(workspace=WORKSPACE, metas=[KERNEL_META_DATA]) diff --git a/windows/README.md b/windows/README.md index d0542e02e2..52ebd0b2da 100644 --- a/windows/README.md +++ b/windows/README.md @@ -30,7 +30,7 @@ Prerequisites: - [TensorRT 9.1.0.4 for TensorRT-LLM](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/secure/9.1.0/tars/tensorrt-9.1.0.4.windows10.x86_64.cuda-12.2.llm.beta.zip) ``` -pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/nightly/cu121 +pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 ``` ## Detailed Setup @@ -118,7 +118,7 @@ The above command will generate `build\tensorrt_llm-0.5.0-py3-none-any.whl`. Oth To download and install the wheel, in Powershell, run: ``` -pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/nightly/cu121 +pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 ``` Alternatively, if you built the wheel from source, run: