From 9f51f8d20c45069cc3a0580b08808715f4c931f0 Mon Sep 17 00:00:00 2001 From: BatshevaBlack <132911331+BatshevaBlack@users.noreply.github.com> Date: Thu, 21 Aug 2025 05:49:55 +0300 Subject: [PATCH 01/33] [None][infra] Upgrade UCX to v1.19.x and NIXL to 0.5.0 (#7024) Signed-off-by: Batsheva Black <132911331+BatshevaBlack@users.noreply.github.com> Signed-off-by: Bo Deng Co-authored-by: Bo Deng --- .../unit_tests/executor/transferAgentTest.cpp | 6 ++- docker/common/install_nixl.sh | 18 ++++---- docker/common/install_ucx.sh | 43 +++++++++---------- jenkins/current_image_tags.properties | 8 ++-- 4 files changed, 39 insertions(+), 36 deletions(-) diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp index c73d9a2140..4745e8e40b 100644 --- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp +++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp @@ -255,7 +255,8 @@ TEST_F(TransferAgentTest, SyncMessage) checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs()); } while (!checked); auto syncMessage = std::string("agent_sync_message"); - TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1, syncMessage}; + nixlAgent0->notifySyncMessage(agent1, syncMessage); + TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1}; auto status = nixlAgent0->submitTransferRequests(writeReq); auto notif = nixlAgent1->getNotifiedSyncMessages(); @@ -302,7 +303,8 @@ TEST_F(TransferAgentTest, SyncMessage) } while (!checked2); std::string syncMessage4 = "four_agent_sync_message"; - TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0, syncMessage4}; + nixlAgent1->notifySyncMessage(agent0, syncMessage4); + TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0}; auto status1 = nixlAgent1->submitTransferRequests(writeReq1); auto notif4 = nixlAgent0->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++) diff --git a/docker/common/install_nixl.sh b/docker/common/install_nixl.sh index 18ee554f69..cecd61a7af 100644 --- a/docker/common/install_nixl.sh +++ b/docker/common/install_nixl.sh @@ -4,8 +4,9 @@ set -ex GITHUB_URL="https://github.com" UCX_INSTALL_PATH="/usr/local/ucx/" CUDA_PATH="/usr/local/cuda" -NIXL_VERSION="0.3.1" +NIXL_VERSION="0.5.0" NIXL_REPO="https://github.com/ai-dynamo/nixl.git" +OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH ARCH_NAME="x86_64-linux-gnu" GDS_PATH="$CUDA_PATH/targets/x86_64-linux" @@ -18,25 +19,26 @@ pip3 install --no-cache-dir meson ninja pybind11 git clone --depth 1 -b ${NIXL_VERSION} ${NIXL_REPO} cd nixl -cuda_path=$(find / -name "libcuda.so.1" 2>/dev/null | head -n1) -if [[ -z "$cuda_path" ]]; then - echo "libcuda.so.1 not found " +CUDA_SO_PATH=$(find "/usr/local" -name "libcuda.so.1" 2>/dev/null | head -n1) + +if [[ -z "$CUDA_SO_PATH" ]]; then + echo "libcuda.so.1 not found" exit 1 fi -ln -sf $cuda_path $CUDA_PATH/lib64/libcuda.so.1 +CUDA_SO_PATH=$(dirname $CUDA_SO_PATH) +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_SO_PATH meson setup builddir \ -Ducx_path=$UCX_INSTALL_PATH \ -Dcudapath_lib="$CUDA_PATH/lib64" \ -Dcudapath_inc="$CUDA_PATH/include" \ -Dgds_path="$GDS_PATH" \ - -Dinstall_headers=true \ - -Dstatic_plugins=UCX + -Dinstall_headers=true cd builddir && ninja install cd ../.. 
rm -rf nixl* # Remove NIXL source tree to save space -rm $CUDA_PATH/lib64/libcuda.so.1 +export LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}" diff --git a/docker/common/install_ucx.sh b/docker/common/install_ucx.sh index 22f444d974..ba35e82ce6 100644 --- a/docker/common/install_ucx.sh +++ b/docker/common/install_ucx.sh @@ -2,29 +2,28 @@ set -ex GITHUB_URL="https://github.com" -UCX_VERSION="v1.18.1" +UCX_VERSION="v1.19.x" UCX_INSTALL_PATH="/usr/local/ucx/" CUDA_PATH="/usr/local/cuda" UCX_REPO="https://github.com/openucx/ucx.git" -if [ ! -d ${UCX_INSTALL_PATH} ]; then - git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO} - cd ucx - ./autogen.sh - ./contrib/configure-release \ - --prefix=${UCX_INSTALL_PATH} \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ - --enable-cma \ - --enable-devel-headers \ - --with-cuda=${CUDA_PATH} \ - --with-verbs \ - --with-dm \ - --enable-mt - make install -j$(nproc) - cd .. - rm -rf ucx # Remove UCX source to save space - echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}" -fi +rm -rf ${UCX_INSTALL_PATH} +git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO} +cd ucx +./autogen.sh +./contrib/configure-release \ + --prefix=${UCX_INSTALL_PATH} \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=${CUDA_PATH} \ + --with-verbs \ + --with-dm \ + --enable-mt +make install -j$(nproc) +cd .. +rm -rf ucx # Remove UCX source to save space +echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}" diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 751f251635..bd46241e51 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -11,7 +11,7 @@ # # NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508130930-6501 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508130930-6501 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508201630-pre-test +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508201630-pre-test From f03053b4ddbfa07c321b5f280cd78f811e153a65 Mon Sep 17 00:00:00 2001 From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> Date: Wed, 20 Aug 2025 19:52:37 -0700 Subject: [PATCH 02/33] [None][fix] update accelerate dependency to 1.7+ for AutoDeploy (#7077) Signed-off-by: Frida Hou <201670829+Fridah-nv@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e2582f5038..a7821f15db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cu128 -c constraints.txt -accelerate>=0.25.0 +accelerate>=1.7.0 build colored cuda-python>=12,<13 From 41ff4901eea835fdcc31a7770fceec363378d748 Mon Sep 17 00:00:00 2001 From: Fan - Yunfan <2569548856@qq.com> Date: Thu, 21 Aug 2025 11:08:11 +0800 Subject: [PATCH 03/33] [None][fix] Fix const modifier inconsistency in log function declaration/implementation (#6679) Signed-off-by: fanyunfan <2569548856@qq.com> Co-authored-by: fanyunfan <2569658856@qq.com> Co-authored-by: Yunfan Fan <46273019+fyf2016@users.noreply.github.com> --- cpp/include/tensorrt_llm/common/logger.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/include/tensorrt_llm/common/logger.h b/cpp/include/tensorrt_llm/common/logger.h index df84e22638..c8164b10e5 100644 --- a/cpp/include/tensorrt_llm/common/logger.h +++ b/cpp/include/tensorrt_llm/common/logger.h @@ -54,20 +54,21 @@ public: #if defined(_MSC_VER) template - void log(Level level, char const* format, Args const&... args); + void log(Level const level, char const* format, Args const&... args); template - void log(Level level, int rank, char const* format, Args const&... args); + void log(Level const level, int const rank, char const* format, Args const&... args); #else template - void log(Level level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0))); + void log(Level const level, char const* format, Args const&... args) __attribute__((format(printf, 3, 0))); template - void log(Level level, int rank, char const* format, Args const&... 
args) __attribute__((format(printf, 4, 0))); + void log(Level const level, int const rank, char const* format, Args const&... args) + __attribute__((format(printf, 4, 0))); #endif template - void log(Level level, std::string const& format, Args const&... args) + void log(Level const level, std::string const& format, Args const&... args) { return log(level, format.c_str(), args...); } @@ -134,7 +135,7 @@ private: }; template -void Logger::log(Logger::Level level, char const* format, Args const&... args) +void Logger::log(Logger::Level const level, char const* format, Args const&... args) { if (isEnabled(level)) { From 21f4434404260333ced9b5a42241f215a5c09bd5 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Thu, 21 Aug 2025 11:15:23 +0800 Subject: [PATCH 04/33] [None][chore] waive failed cases on H100 (#7084) Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com> --- tests/integration/defs/accuracy/test_llm_api_pytorch.py | 1 + tests/integration/test_lists/qa/llm_function_full.txt | 1 - tests/integration/test_lists/qa/llm_function_sanity.txt | 1 - tests/integration/test_lists/waives.txt | 4 ++++ 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 7af8c437d0..2a3a3c9172 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1734,6 +1734,7 @@ class TestKimiK2(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/Kimi-K2-Instruct" @pytest.mark.skip_less_mpi_world_size(8) + @skip_post_blackwell @skip_pre_hopper @pytest.mark.parametrize( "tp_size,pp_size,ep_size,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size", diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index 9e6e12b400..15eb0064ad 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -631,7 +631,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-S test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1] test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False] -test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False] test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-video-False] test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-False] diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index c977a77d3c..c28c9efc48 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -164,7 +164,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B] test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] 
test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] -test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index db4f919855..78884a9d59 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -316,3 +316,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5431146) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5464461) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5448449) +full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=True] SKIP (https://nvbugs/5467815) +full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False] SKIP (https://nvbugs/5467815) +full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] SKIP (https://nvbugs/5467815) +full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] SKIP (https://nvbugs/5467815) From cbcea33279ea76b0f932b502a46b814ecb6a01a8 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Thu, 21 Aug 2025 13:12:21 +0800 Subject: [PATCH 05/33] [fix]: use safeInitRowMax instead of fp32_lowest to avoid NaN (#7087) Signed-off-by: Yao Yao --- cpp/kernels/xqa/mha_sm90.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu index 9a438df9a2..da44fba60c 100644 --- a/cpp/kernels/xqa/mha_sm90.cu +++ b/cpp/kernels/xqa/mha_sm90.cu @@ -1012,7 +1012,7 @@ CUBIN_EXPORT __global__ if (threadIdx.x < smem.gemm1AccColMax.size) { auto const idx = threadIdx.x; - smem.gemm1AccColMax[idx] = mha::numeric_limits::lowest(); + smem.gemm1AccColMax[idx] = safeInitRowMax; smem.gemm1AccColSum[idx] = 0; } smem.gemm1WarpGrpBar.arrive_and_wait(); @@ -1949,7 +1949,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, uint32_t const globalRow = tileStartRow + row; if (globalRow >= cacheSeqLen) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; continue; } if (globalRow >= maskStartRow) @@ -1957,7 +1957,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, uint32_t const maskRow = globalRow - maskStartRow; if ((bit_mask >> maskRow) == 0) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } @@ -2087,7 +2087,7 @@ __device__ inline void warpGrpApplyMask(uint32_t warpRank, Gemm0Acc& acc, uint32 #pragma unroll for (uint32_t j = 0; j < GmmaAccCoreMat::cols; j++) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = 
safeInitRowMax; } } } @@ -2380,9 +2380,9 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, SpecDec const& specDec, { uint32_t const col = GmmaAccCoreMat::cols * (4 * n + idxInQuad) + j; assert((col < nbValidCols) == bool(endMask & (1ULL << col))); - if (((mask >> col) & 1) == 0) + if ((mask & (1ULL << col)) == 0) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } @@ -2410,7 +2410,7 @@ __device__ inline void warpGrpApplyMask(Gemm0Acc& acc, uint32_t validColBeg, uin #pragma unroll for (uint32_t i = 0; i < GmmaAccCoreMat::rows; i++) { - acc(m, n)(i, j) = mha::numeric_limits::lowest(); + acc(m, n)(i, j) = safeInitRowMax; } } } From 647a52698ae038c49de7eae0ff6ae70c96792839 Mon Sep 17 00:00:00 2001 From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> Date: Wed, 20 Aug 2025 22:14:51 -0700 Subject: [PATCH 06/33] [https://nvbugs/5443039][fix] Fix AutoDeploy pattern matcher for torch 2.8 (#7076) Signed-off-by: Frida Hou <201670829+Fridah-nv@users.noreply.github.com> --- tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py b/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py index 00b535dec6..e0e21b1d70 100644 --- a/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py +++ b/tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py @@ -43,11 +43,13 @@ def _patch_unsupported_input_tensor(): """ original_fn = lowering.unsupported_input_tensor - def patched_fn(t: torch.Tensor, parent=None, node=None): + def patched_fn(t: torch.Tensor, *args, **kwargs): """Bypass meta tensor check.""" if t.is_meta: return False - return original_fn(t, parent, node) + return original_fn( + t, *args, **kwargs + ) # a generic pass-through of the arguments to accommodate torch side change lowering.unsupported_input_tensor = patched_fn try: From ba0a86e0bb7e18118e227c47a7b7ac3bf8ba24ca Mon Sep 17 00:00:00 2001 From: bhsueh_NV <11360707+byshiue@users.noreply.github.com> Date: Thu, 21 Aug 2025 13:17:32 +0800 Subject: [PATCH 07/33] [https://nvbugs/5437405][fix] qwen3 235b eagle3 ci (#7000) Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com> --- .../defs/accuracy/test_llm_api_pytorch.py | 51 +++++++++++++++++-- .../test_lists/qa/llm_function_full.txt | 2 +- .../test_lists/qa/llm_function_sanity.txt | 2 +- .../test_lists/test-db/l0_gb200.yml | 1 + .../test-db/l0_gb200_multi_nodes.yml | 1 - tests/integration/test_lists/waives.txt | 1 - 6 files changed, 51 insertions(+), 7 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 2a3a3c9172..8879904627 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2446,11 +2446,12 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness): [ (8, 1, 8, True, True, True, "CUTLASS", False), (8, 1, 8, True, True, True, "TRTLLM", False), - (8, 1, 8, False, False, False, "TRTLLM", True), + (8, 1, 8, True, True, True, "TRTLLM", True), ], ids=[ - "latency_moe_cutlass", "latency_moe_trtllm", - "latency_moe_trtllm_eagle3" + "latency_moe_cutlass", + "latency_moe_trtllm", + "latency_moe_trtllm_eagle3", ], ) def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, @@ -2485,6 +2486,50 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + @skip_pre_blackwell + 
@pytest.mark.skip_less_mpi_world_size(4) + @pytest.mark.parametrize( + "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3", + [ + (4, 1, 4, False, False, False, "TRTLLM", + True), # TP8 has bug when we use TRTLLM moe backend and eagle3 + ], + ids=[ + "latency_moe_trtllm_eagle3", + ], + ) + def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, + cuda_graph, overlap_scheduler, moe_backend, eagle3): + + pytorch_config = dict( + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig(backend=moe_backend)) + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4, + enable_block_reuse=not eagle3) + spec_config = None + if eagle3: + spec_config = EagleDecodingConfig( + max_draft_len=2, + speculative_model_dir= + f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/", + eagle3_one_model=True) + with LLM( + f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + **pytorch_config, + enable_attention_dp=attention_dp, + kv_cache_config=kv_cache_config, + speculative_config=spec_config) as llm: + + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "microsoft/Phi-4-mini-instruct" diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index 15eb0064ad..e28f1bcecd 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -579,7 +579,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8 diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index c28c9efc48..51c452cbc7 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -116,7 +116,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON] 
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM] diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml index ac39fbdc88..7d1cc92fef 100644 --- a/tests/integration/test_lists/test-db/l0_gb200.yml +++ b/tests/integration/test_lists/test-db/l0_gb200.yml @@ -69,3 +69,4 @@ l0_gb200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] + - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml index 9c04ad7090..857319c44c 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml @@ -19,4 +19,3 @@ l0_gb200_multi_nodes: - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180) - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90) - - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (90) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 78884a9d59..86af57819b 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -263,7 +263,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451) examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451) examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451) -accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405,https://nvbugs/5437384) accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241) test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095) test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095) From 2d40e8750bf0d1234646a760bd341ec056f90dd7 Mon Sep 17 00:00:00 2001 From: Farshad Ghodsian <47931571+farshadghodsian@users.noreply.github.com> Date: Thu, 21 Aug 2025 02:33:07 -0400 Subject: [PATCH 08/33] [None][doc] Update gpt-oss deployment guide to latest release image (#7101) Signed-off-by: Farshad Ghodsian <47931571+farshadghodsian@users.noreply.github.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- README.md | 4 +-- .../blog9_Deploying_GPT_OSS_on_TRTLLM.md | 29 +++++++++---------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index f6625a0559..1559ee4d00 100644 --- a/README.md +++ b/README.md @@ -18,10 
+18,9 @@ TensorRT-LLM
## Tech Blogs -* [08/06] Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM +* [08/05] Running a High-Performance GPT-OSS-120B Inference Server with TensorRT-LLM ✨ [➡️ link](./docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md) - * [08/01] Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization) ✨ [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md) @@ -44,6 +43,7 @@ TensorRT-LLM ✨ [➡️ link](./docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) ## Latest News +* [08/05] 🌟 TensorRT-LLM delivers Day-0 support for OpenAI's latest open-weights models: GPT-OSS-120B [➡️ link](https://huggingface.co/openai/gpt-oss-120b) and GPT-OSS-20B [➡️ link](https://huggingface.co/openai/gpt-oss-20b) * [07/15] 🌟 TensorRT-LLM delivers Day-0 support for LG AI Research's latest model, EXAONE 4.0 [➡️ link](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B) * [06/17] Join NVIDIA and DeepInfra for a developer meetup on June 26 ✨ [➡️ link](https://events.nvidia.com/scaletheunscalablenextgenai) * [05/22] Blackwell Breaks the 1,000 TPS/User Barrier With Meta’s Llama 4 Maverick diff --git a/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md b/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md index 8f5c1dfec0..87432173b4 100644 --- a/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md +++ b/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md @@ -19,11 +19,11 @@ We have a forthcoming guide for achieving great performance on H100; however, th In this section, we introduce several ways to install TensorRT-LLM. -### NGC Docker Image of dev branch +### NGC Docker Image -Day-0 support for gpt-oss is provided via the NGC container image `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev`. This image was built on top of the pre-day-0 **dev branch**. This container is multi-platform and will run on both x64 and arm64 architectures. +Visit the [NGC TensorRT-LLM Release page](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) to find the most up-to-date NGC container image to use. You can also check the latest [release notes](https://github.com/NVIDIA/TensorRT-LLM/releases) to keep track of the support status of the latest releases. -Run the following docker command to start the TensorRT-LLM container in interactive mode: +Run the following Docker command to start the TensorRT-LLM container in interactive mode (change the image tag to match latest release): ```bash docker run --rm --ipc=host -it \ @@ -33,7 +33,7 @@ docker run --rm --ipc=host -it \ -p 8000:8000 \ -e TRTLLM_ENABLE_PDL=1 \ -v ~/.cache:/root/.cache:rw \ - nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev \ + nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc0 \ /bin/bash ``` @@ -53,9 +53,9 @@ Additionally, the container mounts your user `.cache` directory to save the down Support for gpt-oss has been [merged](https://github.com/NVIDIA/TensorRT-LLM/pull/6645) into the **main branch** of TensorRT-LLM. As we continue to optimize gpt-oss performance, you can build TensorRT-LLM from source to get the latest features and support. Please refer to the [doc](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) if you want to build from source yourself. 
-### Regular Release of TensorRT-LLM +### TensorRT-LLM Python Wheel Install -Since gpt-oss has been supported on the main branch, you can get TensorRT-LLM out of the box through its regular release in the future. Please check the latest [release notes](https://github.com/NVIDIA/TensorRT-LLM/releases) to keep track of the support status. The release is provided as [NGC Container Image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) or [pip Python wheel](https://pypi.org/project/tensorrt-llm/#history). You can find instructions on pip install [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html). +Regular releases of TensorRT-LLM are also provided as [Python wheels](https://pypi.org/project/tensorrt-llm/#history). You can find instructions on the pip install [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html). ## Performance Benchmarking and Model Serving @@ -210,7 +210,10 @@ We can use `trtllm-serve` to serve the model by translating the benchmark comman ```bash trtllm-serve \ - gpt-oss-120b \ # Or ${local_model_path} +Note: You can also point to a local path containing the model weights instead of the HF repo (e.g., `${local_model_path}`). + +trtllm-serve \ + openai/gpt-oss-120b \ --host 0.0.0.0 \ --port 8000 \ --backend pytorch \ @@ -228,7 +231,8 @@ For max-throughput configuration, run: ```bash trtllm-serve \ - gpt-oss-120b \ # Or ${local_model_path} +trtllm-serve \ + openai/gpt-oss-120b \ --host 0.0.0.0 \ --port 8000 \ --backend pytorch \ @@ -262,7 +266,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d ' "messages": [ { "role": "user", - "content": "What is NVIDIA's advantage for inference?" + "content": "What is NVIDIAs advantage for inference?" } ], "max_tokens": 1024, @@ -348,12 +352,7 @@ others according to your needs. ## (H200/H100 Only) Using OpenAI Triton Kernels for MoE -OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels for Hopper-based GPUs like NVIDIA's H200 for optimal performance. `TRTLLM` MoE backend is not supported on Hopper, and `CUTLASS` backend support is still ongoing. Please enable `TRITON` backend with the steps below if you are running on Hopper GPUs. - -### Installing OpenAI Triton - -The `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev` has prepared Triton already (`echo $TRITON_ROOT` could reveal the path). In other situations, you will need to build and install a specific version of Triton. Please follow the instructions in this [link](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/gpt_oss#using-openai-triton-kernels-for-moe). - +OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM can leverage these kernels for Hopper-based GPUs like NVIDIA's H200 for optimal performance. `TRTLLM` MoE backend is not supported on Hopper, and `CUTLASS` backend support is still ongoing. Please follow the instructions in this [link](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/gpt_oss#using-openai-triton-kernels-for-moe) to install and enable the `TRITON` MoE kernels on Hopper GPUs. 
### Selecting Triton as the MoE backend From c7269ea93a473ed016904179d0592eba73a42d13 Mon Sep 17 00:00:00 2001 From: ChristinaZ <83400082+ChristinaZ@users.noreply.github.com> Date: Thu, 21 Aug 2025 16:58:41 +0800 Subject: [PATCH 09/33] [https://nvbugs/5392414] [fix] Add customized default routing method (#6818) Signed-off-by: Christina Zhang <83400082+ChristinaZ@users.noreply.github.com> --- .../kernels/customMoeRoutingKernels.cu | 268 +++++++++++++ ...ingKernels.h => customMoeRoutingKernels.h} | 4 +- cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh | 205 ++++++++++ .../kernels/renormMoeRoutingKernels.cu | 376 ------------------ cpp/tensorrt_llm/kernels/topkLastDim.cu | 250 ++++++++++-- .../blockScaleMoe/RoutingKernel.cuh | 10 +- .../blockScaleMoe/RoutingLlama4.cu | 2 - .../blockScaleMoe/RoutingRenormalize.cu | 2 - cpp/tensorrt_llm/thop/CMakeLists.txt | 2 +- ...oeRoutingOp.cpp => customMoeRoutingOp.cpp} | 34 +- .../_torch/custom_ops/cpp_custom_ops.py | 8 + .../_torch/modules/fused_moe/routing.py | 16 +- 12 files changed, 742 insertions(+), 435 deletions(-) create mode 100644 cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu rename cpp/tensorrt_llm/kernels/{renormMoeRoutingKernels.h => customMoeRoutingKernels.h} (86%) create mode 100644 cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh delete mode 100644 cpp/tensorrt_llm/kernels/renormMoeRoutingKernels.cu rename cpp/tensorrt_llm/thop/{renormMoeRoutingOp.cpp => customMoeRoutingOp.cpp} (75%) diff --git a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu new file mode 100644 index 0000000000..eb3b958eb2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "moeTopKFuncs.cuh" +#include "tensorrt_llm/common/cudaTypeUtils.cuh" +#include "tensorrt_llm/common/envUtils.h" +#include "tensorrt_llm/kernels/archCondition.h" +#include "tensorrt_llm/kernels/customMoeRoutingKernels.h" +#include // For INT_MAX +#include +#include +#include +#include // For numeric_limits +#include + +namespace cg = cooperative_groups; +using namespace tensorrt_llm::common; + +namespace tensorrt_llm::kernels +{ + +static constexpr int BLOCK_SIZE = 1024; +static constexpr int WARP_SIZE = 32; +static constexpr int WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ T calcSoftmax(cg::thread_block_tile const& warp, T score, int32_t laneIdx, int32_t NumTopExperts) +{ + T maxScore = T{-INFINITY}; + if (laneIdx < NumTopExperts) + { + maxScore = score >= maxScore ? 
score : maxScore; + } + maxScore = cg::reduce(warp, maxScore, cg::greater()); + + float sumScore{0.f}; + float newScore; + // Get the summation of scores for each token + if (laneIdx < NumTopExperts) + { + newScore = static_cast(score) - static_cast(maxScore); + newScore = static_cast(exp(newScore)); + sumScore += newScore; + } + sumScore = cg::reduce(warp, sumScore, cg::plus()); + + if (laneIdx < NumTopExperts) + { + score = static_cast(newScore / sumScore); + } + + return score; +} + +template +__device__ void calcSoftmax(cg::thread_block_tile const& warp, DataType (&scores)[VecSize]) +{ + DataType maxScore = DataType{-INFINITY}; + DataType sumScore = DataType{0.f}; + + // Get the max score for each token +#pragma unroll + for (int i = 0; i < VecSize; ++i) + { + maxScore = scores[i] >= maxScore ? scores[i] : maxScore; + } + maxScore = cg::reduce(warp, maxScore, cg::greater()); + + // Get the summation of scores for each token +#pragma unroll + for (int i = 0; i < VecSize; ++i) + { + scores[i] = static_cast(exp(scores[i] - maxScore)); + sumScore += scores[i]; + } + sumScore = cg::reduce(warp, sumScore, cg::plus()); + + // Normalize the scores +#pragma unroll + for (int i = 0; i < VecSize; ++i) + { + scores[i] = static_cast(scores[i] / sumScore); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, + int32_t const numTokens, int32_t const numExperts, int32_t const topK) +{ + using BaseType = std::conditional_t; + uint32_t const blockRank = blockIdx.x; + uint32_t const tIdx = BLOCK_SIZE * blockRank + threadIdx.x; + uint32_t const warpIdx = tIdx / WARP_SIZE; + uint32_t const laneIdx = tIdx % WARP_SIZE; + uint32_t const warpNum = gridDim.x * WARPS_PER_BLOCK; + auto block = cg::this_thread_block(); + auto warp = cg::tiled_partition(block); + + BaseType minScore = BaseType{-INFINITY}; + for (uint32_t tokenId = warpIdx; tokenId < numTokens; tokenId += warpNum) + { + auto scoreOffset = tokenId * numExperts; + auto outputOffset = tokenId * topK; + + BaseType inputScore[MaxNumExperts / WARP_SIZE]; + IdxT inputIndex[MaxNumExperts / WARP_SIZE]; + + BaseType warpTopKScore[MaxNumTopExperts]; + IdxT warpTopKExpertIdx[MaxNumTopExperts]; + + // Load scores and indices for this warp + for (uint32_t i = 0; i < MaxNumExperts / WARP_SIZE; ++i) + { + auto expertIdx = i * WARP_SIZE + laneIdx; + inputScore[i] + = expertIdx < numExperts ? static_cast(routerLogits[scoreOffset + expertIdx]) : minScore; + inputIndex[i] = expertIdx; + } + + if constexpr (DoSoftmaxBeforeTopK) + { + calcSoftmax(warp, inputScore); + } + // Reduce topK scores and indices for this warp + reduce_topk::reduceTopK(warp, warpTopKScore, warpTopKExpertIdx, inputScore, inputIndex, minScore); + + // Normalize the scores + if constexpr (DoSoftmaxBeforeTopK) + { + if (laneIdx < topK) + { + topkValues[outputOffset + laneIdx] = static_cast(warpTopKScore[laneIdx]); + topkIndices[outputOffset + laneIdx] = warpTopKExpertIdx[laneIdx]; + } + } + else + { + auto softmaxScore = calcSoftmax(warp, + laneIdx < topK ? 
static_cast(warpTopKScore[laneIdx]) : static_cast(minScore), laneIdx, + topK); + if (laneIdx < topK) + { + topkValues[outputOffset + laneIdx] = static_cast(softmaxScore); + topkIndices[outputOffset + laneIdx] = warpTopKExpertIdx[laneIdx]; + } + } + } // end for tokenId +} + +int nextPowerOfTwo(int num) +{ + if (num <= 0) + { + return 1; // Handle invalid input + } + int power = 1; + while (power < num) + { + // Check for overflow before shifting + if (power > INT_MAX / 2) + { + return power; + } + power <<= 1; + } + return power; +} + +#define CASE(MAX_NUM_EXPERTS) \ + case MAX_NUM_EXPERTS: \ + switch (maxNumTopExperts) \ + { \ + case 1: \ + kernelInstance = &customMoeRoutingKernel; \ + break; \ + case 2: \ + kernelInstance = &customMoeRoutingKernel; \ + break; \ + case 4: \ + kernelInstance = &customMoeRoutingKernel; \ + break; \ + case 8: \ + kernelInstance = &customMoeRoutingKernel; \ + break; \ + default: kernelInstance = nullptr; break; \ + } \ + break; + +template +void invokeRenormMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens, + int64_t const numExperts, int64_t const topK, cudaStream_t const stream) +{ + + const uint32_t maxNumBlocks = 1024; + const uint32_t numBlocks = std::min(static_cast((numTokens - 1) / WARPS_PER_BLOCK + 1), maxNumBlocks); + + uint32_t maxNumExperts = nextPowerOfTwo(numExperts) < 32 ? 32 : nextPowerOfTwo(numExperts); + uint32_t maxNumTopExperts = nextPowerOfTwo(topK); + + auto* kernelInstance = &customMoeRoutingKernel; + + switch (maxNumExperts) + { + CASE(32) + CASE(64) + CASE(96) + CASE(128) + default: kernelInstance = nullptr; break; + } + + if (kernelInstance == nullptr) + { + TLLM_CHECK_WITH_INFO(kernelInstance != nullptr, "Can not find corresponding kernel instance."); + } + + dim3 renormMoeRoutingGridDim(numBlocks); + dim3 renormMoeRoutingBlockDim(BLOCK_SIZE); + cudaLaunchConfig_t config; + config.gridDim = renormMoeRoutingGridDim; + config.blockDim = renormMoeRoutingBlockDim; + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, kernelInstance, routerLogits, topkValues, topkIndices, static_cast(numTokens), + static_cast(numExperts), static_cast(topK)); + sync_check_cuda_error(stream); +} + +#define INSTANTIATE_RENORM_MOE_ROUTING(InputT, OutputT, IdxT, DoSoftmaxBeforeTopK) \ + template void invokeRenormMoeRouting(InputT * routerLogits, \ + OutputT * topkValues, IdxT * topkIndices, int64_t const numTokens, int64_t const numExperts, \ + int64_t const topK, cudaStream_t const stream); + +INSTANTIATE_RENORM_MOE_ROUTING(float, float, int32_t, false); +INSTANTIATE_RENORM_MOE_ROUTING(half, float, int32_t, false); +#ifdef ENABLE_BF16 +INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, float, int32_t, false); +#endif + +INSTANTIATE_RENORM_MOE_ROUTING(float, float, int32_t, true); +INSTANTIATE_RENORM_MOE_ROUTING(half, float, int32_t, true); +#ifdef ENABLE_BF16 +INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, float, int32_t, true); +#endif + +} // namespace tensorrt_llm::kernels diff --git a/cpp/tensorrt_llm/kernels/renormMoeRoutingKernels.h b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h similarity index 86% rename from cpp/tensorrt_llm/kernels/renormMoeRoutingKernels.h rename to cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h index 
1e9b001f65..cfe0ae8f15 100644 --- a/cpp/tensorrt_llm/kernels/renormMoeRoutingKernels.h +++ b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ namespace tensorrt_llm::kernels { -template +template void invokeRenormMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens, int64_t const numExperts, int64_t const topK, cudaStream_t const stream); } // namespace tensorrt_llm::kernels diff --git a/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh b/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh new file mode 100644 index 0000000000..933b599dbd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh @@ -0,0 +1,205 @@ + +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#ifndef TRTLLM_MOETOPKFUNCS_CUH_H +#define TRTLLM_MOETOPKFUNCS_CUH_H + +#include +#include +#include + +#include "tensorrt_llm/kernels/archCondition.h" + +namespace tensorrt_llm::kernels +{ + +namespace reduce_topk +{ +namespace cg = cooperative_groups; +static constexpr int kWARP_SIZE = 32; +static constexpr bool kTLLM_GEN_HAS_FAST_REDUX = tensorrt_llm::kernels::arch::is_major_v<10>; + +template +struct TopKRedType +{ + using T = T_; + static_assert(std::is_same_v || std::is_same_v || std::is_same_v + || std::is_same_v, + "Top K reduction only implemented for int, float, float16 and bfloat16"); + + using TypeCmp = std::conditional_t; + using IdxT = std::conditional_t; + + static constexpr int kMoveBits = (sizeof(T) == 4) ? 32 : 16; + static constexpr int kMaxIdx = 65535; + TypeCmp compValIdx; + + static __host__ __device__ inline TypeCmp makeCmpVal(T val, int32_t idx = 0) + { + auto valueBits = cub::Traits::TwiddleIn(reinterpret_cast::UnsignedBits&>(val)); + TypeCmp compactTmp = reinterpret_cast(valueBits); + compactTmp = (compactTmp << kMoveBits) | (0xFFFF & (kMaxIdx - idx)); + // Use 65535 minus idx to give higher priority to elements with smaller indices. 
+ return compactTmp; + } + + static __host__ __device__ void unpack(T& value, int32_t& index, TypeCmp cmp) + { + // Since “65535-idx” is always smaller than 65536 and positive, we can directly use it as the lower 16 bits + index = kMaxIdx - static_cast((cmp & 0xFFFF)); + + auto compactTmp = cmp >> kMoveBits; + auto valueBits + = cub::Traits::TwiddleOut(reinterpret_cast::UnsignedBits&>(compactTmp)); + value = reinterpret_cast(valueBits); + } + + __host__ __device__ TopKRedType() = default; + + __host__ __device__ TopKRedType(T val, int32_t idx) + : compValIdx(makeCmpVal(val, idx)) + { + } + + __host__ __device__ operator TypeCmp() const noexcept + { + return compValIdx; + } + + __device__ inline TypeCmp reduce(cg::thread_block_tile const& warp) + { + if constexpr (!kTLLM_GEN_HAS_FAST_REDUX || sizeof(TypeCmp) == 8) + { + return cg::reduce(warp, compValIdx, cg::greater{}); + } + else + { + TypeCmp result; + asm("redux.sync.max.u32 %0, %1, 0xffffffff;\n" : "=r"(result) : "r"(compValIdx)); + return result; + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct TopKIdx +{ + // by default, empty +}; + +template +struct TopKIdx +{ + static constexpr int K = K_; + int32_t val[K]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define TOPK_SWAP(I, J) \ + { \ + auto pairMin = min(topK[I].compValIdx, topK[J].compValIdx); \ + auto pairMax = max(topK[I].compValIdx, topK[J].compValIdx); \ + topK[I].compValIdx = pairMax; \ + topK[J].compValIdx = pairMin; \ + } + +template +struct Sort; + +template +struct Sort<1, RedType> +{ + static __device__ void run(RedType* topK) {} +}; + +template +struct Sort<2, RedType> +{ + static __device__ void run(RedType* topK) + { + TOPK_SWAP(0, 1); + } +}; + +template +struct Sort<3, RedType> +{ + static __device__ void run(RedType* topK) + { + TOPK_SWAP(0, 1); + TOPK_SWAP(1, 2); + TOPK_SWAP(0, 1); + } +}; + +template +struct Sort<4, RedType> +{ + static __device__ void run(RedType* topK) + { + TOPK_SWAP(0, 2); + TOPK_SWAP(1, 3); + TOPK_SWAP(0, 1); + TOPK_SWAP(2, 3); + TOPK_SWAP(1, 2); + } +}; + +template +__device__ void reduceTopK(cg::thread_block_tile const& warp, Type (&out)[K], int32_t (&outIdx)[K], + Type (&value)[N], int32_t (&idx)[N], Type minValue) +{ + static_assert(K > 0, "Top K must have K > 0"); + static_assert(K < kWARP_SIZE, "Top K must have K < kWARP_SIZE"); + static_assert(N > 0, "Top K must have N > 0"); + static_assert(N < 5, "Only support candidates number less than or equal to 128"); + using RedType = TopKRedType; + RedType topK[N]; +#pragma unroll + for (int nn = 0; nn < N; ++nn) + { + topK[nn] = RedType{value[nn], idx[nn]}; + } + + if constexpr (!IsSorted) + { + Sort::run(topK); + } + typename RedType::TypeCmp packedMax{}; +#pragma unroll + for (int kk = 0; kk < K; ++kk) + { + bool update = kk > 0 && packedMax == topK[0].compValIdx; +#pragma unroll + for (int nn = 0; nn < N; ++nn) + { + topK[nn] = update && nn == N - 1 ? RedType{minValue, idx[nn]} : update ? 
topK[nn + 1] : topK[nn]; + } + // get the next largest value + packedMax = topK[0].reduce(warp); + RedType::unpack(out[kk], outIdx[kk], packedMax); + } +}; + +#undef TOPK_SWAP + +} // namespace reduce_topk +} // namespace tensorrt_llm::kernels +#endif // TRTLLM_MOETOPKFUNCS_CUH_H diff --git a/cpp/tensorrt_llm/kernels/renormMoeRoutingKernels.cu b/cpp/tensorrt_llm/kernels/renormMoeRoutingKernels.cu deleted file mode 100644 index 1b4239e48c..0000000000 --- a/cpp/tensorrt_llm/kernels/renormMoeRoutingKernels.cu +++ /dev/null @@ -1,376 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "tensorrt_llm/common/cudaTypeUtils.cuh" -#include "tensorrt_llm/common/envUtils.h" -#include "tensorrt_llm/kernels/archCondition.h" -#include "tensorrt_llm/kernels/renormMoeRoutingKernels.h" -#include // For INT_MAX -#include -#include -#include -#include // For numeric_limits -#include - -namespace cg = cooperative_groups; -using namespace tensorrt_llm::common; - -namespace tensorrt_llm::kernels -{ - -static constexpr int BLOCK_SIZE = 1024; -static constexpr int WARP_SIZE = 32; -static constexpr int WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE; - -namespace reduce_topk -{ - -static constexpr bool TLLM_GEN_HAS_FAST_REDUX = tensorrt_llm::kernels::arch::is_major_v<10>; - -template -struct TopKRedType -{ - using T = T_; - static_assert(std::is_same_v || std::is_same_v || std::is_same_v, - "Top K reduction only implemented for float, float16 and bfloat16"); - - using TypeCmp = std::conditional_t; - using IdxT = std::conditional_t; - static constexpr int moveBits = (sizeof(T) == 4) ? 32 : 16; - static constexpr int maxIdx = 65535; - TypeCmp compValIdx; - - static __host__ __device__ inline TypeCmp makeCmpVal(T val, int32_t idx = 0) - { - auto valueBits = cub::Traits::TwiddleIn(reinterpret_cast::UnsignedBits&>(val)); - TypeCmp compactTmp = reinterpret_cast(valueBits); - compactTmp = (compactTmp << moveBits) | (0xFFFF & (maxIdx - idx)); - // Use 65535 minus idx to give higher priority to elements with smaller indices. 
- return compactTmp; - } - - static __host__ __device__ void unpack(T& value, int32_t& index, TypeCmp cmp) - { - // Since “65535-idx” is always smaller than 65536 and positive, we can directly use it as the lower 16 bits - index = maxIdx - static_cast((cmp & 0xFFFF)); - - auto compactTmp = cmp >> moveBits; - auto valueBits - = cub::Traits::TwiddleOut(reinterpret_cast::UnsignedBits&>(compactTmp)); - value = reinterpret_cast(valueBits); - } - - __host__ __device__ TopKRedType() = default; - - __host__ __device__ TopKRedType(T val, int32_t idx) - : compValIdx(makeCmpVal(val, idx)) - { - } - - __host__ __device__ operator TypeCmp() const noexcept - { - return compValIdx; - } - - __device__ inline TypeCmp reduce(cg::thread_block_tile const& warp) - { - if constexpr (!TLLM_GEN_HAS_FAST_REDUX || sizeof(TypeCmp) == 8) - { - return cg::reduce(warp, compValIdx, cg::greater{}); - } - else - { - TypeCmp result; - asm("redux.sync.max.u32 %0, %1, 0xffffffff;\n" : "=r"(result) : "r"(compValIdx)); - return result; - } - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct TopKIdx -{ - // by default, empty -}; - -template -struct TopKIdx -{ - static constexpr int K = K_; - int32_t val[K]; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -#define TOPK_SWAP(I, J) \ - { \ - auto pairMin = min(topK[I].compValIdx, topK[J].compValIdx); \ - auto pairMax = max(topK[I].compValIdx, topK[J].compValIdx); \ - topK[I].compValIdx = pairMax; \ - topK[J].compValIdx = pairMin; \ - } - -template -struct Sort; - -template -struct Sort<1, RedType> -{ - static __device__ void run(RedType* topK) {} -}; - -template -struct Sort<2, RedType> -{ - static __device__ void run(RedType* topK) - { - TOPK_SWAP(0, 1); - } -}; - -template -struct Sort<3, RedType> -{ - static __device__ void run(RedType* topK) - { - TOPK_SWAP(0, 1); - TOPK_SWAP(1, 2); - TOPK_SWAP(0, 1); - } -}; - -template -struct Sort<4, RedType> -{ - static __device__ void run(RedType* topK) - { - TOPK_SWAP(0, 2); - TOPK_SWAP(1, 3); - TOPK_SWAP(0, 1); - TOPK_SWAP(2, 3); - TOPK_SWAP(1, 2); - } -}; - -template -__device__ void reduceTopK(cg::thread_block_tile const& warp, Type (&out)[K], int32_t (&outIdx)[K], - Type (&value)[N], int32_t (&idx)[N], Type minValue) -{ - static_assert(K > 0, "Top K must have K > 0"); - static_assert(K < WARP_SIZE, "Top K must have K < WARP_SIZE"); - static_assert(N > 0, "Top K must have N > 0"); - static_assert(N < 5, "Only support candidates number less than or equal to 128"); - using RedType = TopKRedType; - RedType topK[N]; -#pragma unroll - for (int nn = 0; nn < N; ++nn) - { - topK[nn] = RedType{value[nn], idx[nn]}; - } - - if constexpr (!IsSorted) - { - Sort::run(topK); - } - typename RedType::TypeCmp packedMax{}; -#pragma unroll - for (int kk = 0; kk < K; ++kk) - { - bool update = kk > 0 && packedMax == topK[0].compValIdx; -#pragma unroll - for (int nn = 0; nn < N; ++nn) - { - topK[nn] = update && nn == N - 1 ? RedType{minValue, idx[nn]} : update ? 
topK[nn + 1] : topK[nn]; - } - // get the next largest value - packedMax = topK[0].reduce(warp); - RedType::unpack(out[kk], outIdx[kk], packedMax); - } -}; - -#undef TOPK_SWAP - -} // end of namespace reduce_topk - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template -__device__ T calcSoftmax(cg::thread_block_tile const& warp, T score, int32_t laneIdx, int32_t NumTopExperts) -{ - T maxScore = T{-INFINITY}; - if (laneIdx < NumTopExperts) - { - maxScore = score >= maxScore ? score : maxScore; - } - maxScore = cg::reduce(warp, maxScore, cg::greater()); - - float sumScore = float{0.f}; - float newScore; - // Get the summation of scores for each token - if (laneIdx < NumTopExperts) - { - newScore = static_cast(score) - static_cast(maxScore); - newScore = static_cast(exp(newScore)); - sumScore += newScore; - } - sumScore = cg::reduce(warp, sumScore, cg::plus()); - - if (laneIdx < NumTopExperts) - { - score = static_cast(newScore / sumScore); - } - - return score; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template -__global__ void renormMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, - int32_t const numTokens, int32_t const numExperts, int32_t const topK) -{ - - uint32_t const blockRank = blockIdx.x; - uint32_t const tIdx = BLOCK_SIZE * blockRank + threadIdx.x; - uint32_t const warpIdx = tIdx / WARP_SIZE; - uint32_t const laneIdx = tIdx % WARP_SIZE; - uint32_t const warpNum = gridDim.x * WARPS_PER_BLOCK; - auto block = cg::this_thread_block(); - auto warp = cg::tiled_partition(block); - - InputT minScore = InputT{-INFINITY}; - for (uint32_t tokenId = warpIdx; tokenId < numTokens; tokenId += warpNum) - { - auto scoreOffset = tokenId * numExperts; - auto outputOffset = tokenId * topK; - InputT inputScore[MaxNumExperts / WARP_SIZE]; - IdxT inputIndex[MaxNumExperts / WARP_SIZE]; - - InputT warpTopKScore[MaxNumTopExperts]; - IdxT warpTopKExpertIdx[MaxNumTopExperts]; - - // Load scores and indices for this warp - for (uint32_t i = 0; i < MaxNumExperts / WARP_SIZE; ++i) - { - auto expertIdx = i * WARP_SIZE + laneIdx; - inputScore[i] - = expertIdx < numExperts ? static_cast(routerLogits[scoreOffset + expertIdx]) : minScore; - inputIndex[i] = expertIdx; - } - - // Reduce topK scores and indices for this warp - reduce_topk::reduceTopK(warp, warpTopKScore, warpTopKExpertIdx, inputScore, inputIndex, minScore); - - // Perform softmax on topK scores - auto score = calcSoftmax(warp, - laneIdx < topK ? 
static_cast(warpTopKScore[laneIdx]) : static_cast(minScore), laneIdx, topK); - if (laneIdx < topK) - { - topkValues[outputOffset + laneIdx] = static_cast(score); - topkIndices[outputOffset + laneIdx] = warpTopKExpertIdx[laneIdx]; - } - } // end for tokenId -} - -int nextPowerOfTwo(int num) -{ - if (num <= 0) - { - return 1; // Handle invalid input - } - int power = 1; - while (power < num) - { - // Check for overflow before shifting - if (power > INT_MAX / 2) - { - return power; - } - power <<= 1; - } - return power; -} - -#define CASE(MAX_NUM_EXPERTS) \ - case MAX_NUM_EXPERTS: \ - switch (maxNumTopExperts) \ - { \ - case 1: kernelInstance = &renormMoeRoutingKernel; break; \ - case 2: kernelInstance = &renormMoeRoutingKernel; break; \ - case 4: kernelInstance = &renormMoeRoutingKernel; break; \ - case 8: kernelInstance = &renormMoeRoutingKernel; break; \ - default: kernelInstance = nullptr; break; \ - } \ - break; - -template -void invokeRenormMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens, - int64_t const numExperts, int64_t const topK, cudaStream_t const stream) -{ - - const uint32_t maxNumBlocks = 1024; - const uint32_t numBlocks = std::min(static_cast((numTokens - 1) / WARPS_PER_BLOCK + 1), maxNumBlocks); - - uint32_t maxNumExperts = nextPowerOfTwo(numExperts) < 32 ? 32 : nextPowerOfTwo(numExperts); - uint32_t maxNumTopExperts = nextPowerOfTwo(topK); - - auto* kernelInstance = &renormMoeRoutingKernel; - - switch (maxNumExperts) - { - CASE(32) - CASE(64) - CASE(96) - CASE(128) - default: kernelInstance = nullptr; break; - } - - if (kernelInstance == nullptr) - { - TLLM_CHECK_WITH_INFO(kernelInstance != nullptr, "Can not find corresponding kernel instance."); - } - - dim3 renormMoeRoutingGridDim(numBlocks); - dim3 renormMoeRoutingBlockDim(BLOCK_SIZE); - cudaLaunchConfig_t config; - config.gridDim = renormMoeRoutingGridDim; - config.blockDim = renormMoeRoutingBlockDim; - config.dynamicSmemBytes = 0; - config.stream = stream; - cudaLaunchAttribute attrs[1]; - attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); - config.numAttrs = 1; - config.attrs = attrs; - cudaLaunchKernelEx(&config, kernelInstance, routerLogits, topkValues, topkIndices, static_cast(numTokens), - static_cast(numExperts), static_cast(topK)); - sync_check_cuda_error(stream); -} - -#define INSTANTIATE_RENORM_MOE_ROUTING(InputT, OutputT, IdxT) \ - template void invokeRenormMoeRouting(InputT * routerLogits, OutputT * topkValues, \ - IdxT * topkIndices, int64_t const numTokens, int64_t const numExperts, int64_t const topK, \ - cudaStream_t const stream); - -INSTANTIATE_RENORM_MOE_ROUTING(float, float, int32_t); -INSTANTIATE_RENORM_MOE_ROUTING(half, float, int32_t); -#ifdef ENABLE_BF16 -INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, float, int32_t); -#endif - -} // namespace tensorrt_llm::kernels diff --git a/cpp/tensorrt_llm/kernels/topkLastDim.cu b/cpp/tensorrt_llm/kernels/topkLastDim.cu index 3371ab4a0f..2e9e0c9179 100644 --- a/cpp/tensorrt_llm/kernels/topkLastDim.cu +++ b/cpp/tensorrt_llm/kernels/topkLastDim.cu @@ -22,9 +22,17 @@ */ #include +#include "moeTopKFuncs.cuh" #include "topkLastDim.h" +#include +#include #include #include +#include +#include +#include +#include +#include namespace tensorrt_llm { @@ -201,12 +209,12 @@ __host__ __device__ IdxT calc_buf_len(IdxT len) * @param len the number of elements to read * @param f the lambda taking two arguments (T x, 
IdxT idx) */ -template -__device__ void vectorized_process(size_t thread_rank, size_t num_threads, T const* in, idxT len, Func f) +template +__device__ void vectorized_process(size_t thread_rank, size_t num_threads, T const* in, IdxT len, Func f) { if constexpr (sizeof(T) >= sizeof(WideT)) { - for (idxT i = thread_rank; i < len; i += num_threads) + for (IdxT i = thread_rank; i < len; i += num_threads) { f(in[i], i); } @@ -231,12 +239,12 @@ __device__ void vectorized_process(size_t thread_rank, size_t num_threads, T con skip_cnt = len; } WideT const* in_cast = reinterpret_cast(in + skip_cnt); - const idxT len_cast = (len - skip_cnt) / items_per_scalar; + const IdxT len_cast = (len - skip_cnt) / items_per_scalar; - for (idxT i = thread_rank; i < len_cast; i += num_threads) + for (IdxT i = thread_rank; i < len_cast; i += num_threads) { wide.scalar = in_cast[i]; - const idxT real_i = skip_cnt + i * items_per_scalar; + const IdxT real_i = skip_cnt + i * items_per_scalar; #pragma unroll for (int j = 0; j < items_per_scalar; ++j) { @@ -256,7 +264,7 @@ __device__ void vectorized_process(size_t thread_rank, size_t num_threads, T con // and so // len - (skip_cnt + len_cast * items_per_scalar) < items_per_scalar <= WARP_SIZE // no need to use loop - const idxT remain_i = skip_cnt + len_cast * items_per_scalar + thread_rank; + const IdxT remain_i = skip_cnt + len_cast * items_per_scalar + thread_rank; if (remain_i < len) { f(in[remain_i], remain_i); @@ -265,14 +273,14 @@ __device__ void vectorized_process(size_t thread_rank, size_t num_threads, T con } // sync_width should >= WARP_SIZE -template -__device__ void vectorized_process(T const* in, idxT len, Func f, int sync_width) +template +__device__ void vectorized_process(T const* in, IdxT len, Func f, int sync_width) { - const idxT stride = blockDim.x * gridDim.x; - const idxT tid = blockIdx.x * blockDim.x + threadIdx.x; + const IdxT stride = blockDim.x * gridDim.x; + const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; if constexpr (sizeof(T) >= sizeof(WideT)) { - for (idxT i = tid; i < len; i += stride) + for (IdxT i = tid; i < len; i += stride) { f(in[i], i, true); } @@ -296,17 +304,17 @@ __device__ void vectorized_process(T const* in, idxT len, Func f, int sync_width skip_cnt = len; } WideT const* in_cast = reinterpret_cast(in + skip_cnt); - const idxT len_cast = (len - skip_cnt) / items_per_scalar; + const IdxT len_cast = (len - skip_cnt) / items_per_scalar; - const idxT len_cast_for_sync = ((len_cast - 1) / sync_width + 1) * sync_width; - for (idxT i = tid; i < len_cast_for_sync; i += stride) + const IdxT len_cast_for_sync = ((len_cast - 1) / sync_width + 1) * sync_width; + for (IdxT i = tid; i < len_cast_for_sync; i += stride) { bool valid = i < len_cast; if (valid) { wide.scalar = in_cast[i]; } - const idxT real_i = skip_cnt + i * items_per_scalar; + const IdxT real_i = skip_cnt + i * items_per_scalar; #pragma unroll for (int j = 0; j < items_per_scalar; ++j) { @@ -323,7 +331,7 @@ __device__ void vectorized_process(T const* in, idxT len, Func f, int sync_width T value = valid ? in[tid] : T(); f(value, tid, valid); - const idxT remain_i = skip_cnt + len_cast * items_per_scalar + tid; + const IdxT remain_i = skip_cnt + len_cast * items_per_scalar + tid; valid = remain_i < len; value = valid ? 
in[remain_i] : T(); f(value, remain_i, valid); @@ -1164,6 +1172,77 @@ __global__ void radix_topk_one_block_kernel(T const* in, IdxT const* in_idx, con } // namespace air_topk_stable //} +namespace moe_topk +{ +namespace cg = cooperative_groups; +static constexpr int kBLOCK_SIZE = 1024; +static constexpr int kWARP_SIZE = 32; +static constexpr int kWARPS_PER_BLOCK = kBLOCK_SIZE / kWARP_SIZE; + +template +__device__ T negativeInfinity() +{ + return -INFINITY; +} + +template <> +__device__ half negativeInfinity() +{ + return -CUDART_INF_FP16; +} + +template <> +__device__ __nv_bfloat16 negativeInfinity<__nv_bfloat16>() +{ + return -CUDART_INF_BF16; +} + +/****************TopK kernel for candidate number<= 128 and K <= 8 **************** */ +template +__global__ void moe_topk_kernel( + InputT const* in, OutputT* out, IdxT* outIdx, int32_t const batchSize, int32_t const len, int32_t const topK) +{ + + uint32_t const blockRank = blockIdx.x; + uint32_t const tIdx = kBLOCK_SIZE * blockRank + threadIdx.x; + uint32_t const warpIdx = tIdx / kWARP_SIZE; + uint32_t const laneIdx = tIdx % kWARP_SIZE; + uint32_t const warpNum = gridDim.x * kWARPS_PER_BLOCK; + auto block = cg::this_thread_block(); + auto warp = cg::tiled_partition(block); + + InputT minScore = negativeInfinity(); + + for (uint32_t tokenId = warpIdx; tokenId < batchSize; tokenId += warpNum) + { + auto scoreOffset = tokenId * len; + auto outputOffset = tokenId * topK; + InputT inputScore[MaxLen / kWARP_SIZE]; + IdxT inputIndex[MaxLen / kWARP_SIZE]; + + InputT warpTopKScore[MaxTopK]; + IdxT warpTopKExpertIdx[MaxTopK]; + + // Load scores and indices for this warp + for (uint32_t i = 0; i < MaxLen / kWARP_SIZE; ++i) + { + auto expertIdx = i * kWARP_SIZE + laneIdx; + inputScore[i] = expertIdx < len ? static_cast(in[scoreOffset + expertIdx]) : minScore; + inputIndex[i] = expertIdx; + } + + // Reduce topK scores and indices for this warp + tensorrt_llm::kernels::reduce_topk::reduceTopK( + warp, warpTopKScore, warpTopKExpertIdx, inputScore, inputIndex, minScore); + + if (laneIdx < topK) + { + out[outputOffset + laneIdx] = static_cast(warpTopKScore[laneIdx]); + outIdx[outputOffset + laneIdx] = warpTopKExpertIdx[laneIdx]; + } + } // end for tokenId +} +} // namespace moe_topk /***************Runtime API****************/ @@ -1221,9 +1300,11 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx IdxT* sort_in_idx = nullptr; air_topk_stable::ComputeOffset computeoffset(k); - cub::CountingInputIterator counting_iter(0); - cub::TransformInputIterator, cub::CountingInputIterator> - transform_iter(counting_iter, computeoffset); + + thrust::counting_iterator counting_iter(0); + thrust::transform_iterator, thrust::counting_iterator> transform_iter( + counting_iter, computeoffset); + cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size, batch_size, transform_iter, transform_iter + 1, stream); if (sorted) @@ -1348,9 +1429,9 @@ void standalone_stable_radix_topk_one_block_(void* buf, size_t& buf_size, T cons const IdxT buf_len = air_topk_stable::calc_buf_len(len); air_topk_stable::ComputeOffset computeoffset(k); - cub::CountingInputIterator counting_iter(0); - cub::TransformInputIterator, cub::CountingInputIterator> - transform_iter(counting_iter, computeoffset); + thrust::counting_iterator counting_iter(0); + thrust::transform_iterator, thrust::counting_iterator> transform_iter( + counting_iter, computeoffset); cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, 
out_idx, out, out, k * batch_size, batch_size, transform_iter, transform_iter + 1, stream); @@ -1421,36 +1502,120 @@ void standalone_stable_radix_topk_one_block_(void* buf, size_t& buf_size, T cons } } -template -void standalone_stable_radix_11bits(void* buf, size_t& buf_size, T const* in, int batch_size, idxT len, idxT k, T* out, - idxT* out_idx, bool greater, cudaStream_t stream = 0) +template +void standalone_stable_radix_11bits(void* buf, size_t& buf_size, T const* in, int batch_size, IdxT len, IdxT k, T* out, + IdxT* out_idx, bool greater, cudaStream_t stream = 0) { constexpr int items_per_thread = 32; constexpr int block_dim = 512; constexpr bool fused_last_filter = false; if (len <= block_dim * items_per_thread) { - standalone_stable_radix_topk_one_block_( - buf, buf_size, in, static_cast(nullptr), batch_size, len, k, out, out_idx, !greater, stream, sorted); + standalone_stable_radix_topk_one_block_( + buf, buf_size, in, static_cast(nullptr), batch_size, len, k, out, out_idx, !greater, stream, sorted); } else { int sm_cnt = tensorrt_llm::common::getMultiProcessorCount(); - unsigned grid_dim = air_topk_stable::calc_grid_dim(batch_size, len, sm_cnt); + unsigned grid_dim = air_topk_stable::calc_grid_dim(batch_size, len, sm_cnt); if (grid_dim == 1) { - standalone_stable_radix_topk_one_block_(buf, buf_size, in, - static_cast(nullptr), batch_size, len, k, out, out_idx, !greater, stream, sorted); + standalone_stable_radix_topk_one_block_(buf, buf_size, in, + static_cast(nullptr), batch_size, len, k, out, out_idx, !greater, stream, sorted); } else { - standalone_stable_radix_topk_(buf, buf_size, in, static_cast(nullptr), + standalone_stable_radix_topk_(buf, buf_size, in, static_cast(nullptr), batch_size, len, k, out, out_idx, !greater, fused_last_filter, grid_dim, stream, sorted); } } } +int nextPowerOfTwo(int num) +{ + if (num <= 0) + { + return 1; // Handle invalid input + } + int power = 1; + while (power < num) + { + // Check for overflow before shifting + if (power > INT_MAX / 2) + { + return power; + } + power <<= 1; + } + return power; +} + +template +void moe_reduce_topk( + T const* in, int batch_size, IdxT len, IdxT k, T* out, IdxT* out_idx, bool greater, cudaStream_t stream = 0) +{ + using InputT = T; + using OutputT = T; + const uint32_t max_num_blocks = 1024; + const uint32_t num_blocks + = std::min(static_cast((batch_size - 1) / moe_topk::kWARPS_PER_BLOCK + 1), max_num_blocks); + + uint32_t max_len = nextPowerOfTwo(len) < 32 ? 
32 : nextPowerOfTwo(len); + uint32_t moe_topk = nextPowerOfTwo(k); + + auto* kernel_instance = &moe_topk::moe_topk_kernel; + + switch (max_len) + { + case 32: + switch (moe_topk) + { + case 1: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 2: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 4: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 8: kernel_instance = &moe_topk::moe_topk_kernel; break; + default: kernel_instance = nullptr; break; + } + break; + case 64: + switch (moe_topk) + { + case 1: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 2: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 4: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 8: kernel_instance = &moe_topk::moe_topk_kernel; break; + default: kernel_instance = nullptr; break; + } + break; + case 96: + switch (moe_topk) + { + case 1: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 2: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 4: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 8: kernel_instance = &moe_topk::moe_topk_kernel; break; + default: kernel_instance = nullptr; break; + } + break; + case 128: + switch (moe_topk) + { + case 1: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 2: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 4: kernel_instance = &moe_topk::moe_topk_kernel; break; + case 8: kernel_instance = &moe_topk::moe_topk_kernel; break; + default: kernel_instance = nullptr; break; + } + break; + default: kernel_instance = nullptr; break; + } + + dim3 moe_topk_grid_dim(num_blocks); + dim3 moe_topk_block_dim(moe_topk::kBLOCK_SIZE); + + kernel_instance<<>>(in, out, out_idx, batch_size, len, k); +} #endif /////////////// @@ -1459,22 +1624,22 @@ template size_t invokeComputeTopkLastDimWorkspaceSize( SizeType32 batchSize, SizeType32 inputLength, SizeType32 k, bool is_largest) { - using idxT = SizeType32; + using IdxT = SizeType32; size_t buf_size = 0; void* workspace = nullptr; T const* in = nullptr; T* out_val = nullptr; - idxT* out_idx = nullptr; + IdxT* out_idx = nullptr; constexpr int block_dim = 512; constexpr bool fused_last_filter = false; constexpr bool sorted = true; int sm_cnt = tensorrt_llm::common::getMultiProcessorCount(); - unsigned grid_dim = air_topk_stable::calc_grid_dim(batchSize, inputLength, sm_cnt); + unsigned grid_dim = air_topk_stable::calc_grid_dim(batchSize, inputLength, sm_cnt); - standalone_stable_radix_topk_(workspace, buf_size, in, static_cast(nullptr), + standalone_stable_radix_topk_(workspace, buf_size, in, static_cast(nullptr), batchSize, inputLength, k, out_val, out_idx, !is_largest, fused_last_filter, grid_dim, 0, sorted); return buf_size; } @@ -1504,8 +1669,17 @@ void invokeTopkLastDim(SizeType32 batchSize, SizeType32 inputLength, SizeType32 T const* in = reinterpret_cast(input); T* out_val_ = reinterpret_cast(out_val); SizeType32* out_idx_ = reinterpret_cast(out_idx); - standalone_stable_radix_11bits( - workspace, buf_size, in, batchSize, inputLength, k, out_val_, out_idx_, is_largest, stream); + if (inputLength <= 128 && k <= 8 && is_largest == true) + { + // This method does not require a buffer, but since the implementation may vary in different cases, + // we still allocate the buffer in case AIR TopK is used instead. 
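+        // For example (sizes hypothetical): inputLength = 100 and k = 3 are rounded up inside
+        // moe_reduce_topk to the next powers of two, with a floor of 32 on the length, giving
+        // max_len = 128 and moe_topk = 4; that pair selects the matching moe_topk_kernel
+        // specialization, and each warp then produces the top-k for one row. Shapes outside
+        // this range take the AIR TopK path in the else branch.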
+ moe_reduce_topk(in, batchSize, inputLength, k, out_val_, out_idx_, !is_largest, stream); + } + else + { + standalone_stable_radix_11bits( + workspace, buf_size, in, batchSize, inputLength, k, out_val_, out_idx_, is_largest, stream); + } } #define INSTANTIATE_TOPK_LastDim_DATA_TYPE(T) \ diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernel.cuh b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernel.cuh index 750658fad7..92d020fd19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernel.cuh +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingKernel.cuh @@ -378,7 +378,7 @@ __device__ void routingPermutation(KernelParams params, PackedScoreIdx // We can't do it earlier because FC1 depends on the mPtrCtaIdxXyToBatchIdx, // mPtrCtaIdxXyToMnLimit, mPtrNumNonExitingCtas and mPtrTotalNumPaddedTokens // TODO: this is not sufficient to ensure visibility in the next kernel! -#if !defined(PDL_PROFILE) || PDL_PROFILE == 0 +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) if constexpr (KernelParams::UsePdl) { cudaTriggerProgrammaticLaunchCompletion(); @@ -757,15 +757,13 @@ __global__ void __launch_bounds__(NumThreadsHist) routingIndicesOffsetsKernel(Ke } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) -// Trigger secondary kernel. -// Note: this does not guarantee the visibility of prior writes unless the consumer executes a -// dependency sync. -#if !defined(PDL_PROFILE) || PDL_PROFILE == 0 + // Trigger secondary kernel. + // Note: this does not guarantee the visibility of prior writes unless the consumer executes a + // dependency sync. if constexpr (KernelParams::UsePdl) { cudaTriggerProgrammaticLaunchCompletion(); } -#endif #endif // if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingLlama4.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingLlama4.cu index f1f60abdc2..5c39892039 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingLlama4.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingLlama4.cu @@ -227,13 +227,11 @@ __global__ void __launch_bounds__(WarpSize) routingIndicesWarpKernel(KernelParam } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) -#if !defined(PDL_PROFILE) || PDL_PROFILE == 0 // we can trigger the next kernel at this point if constexpr (KernelParams::UsePdl) { cudaTriggerProgrammaticLaunchCompletion(); } -#endif #endif // at this point, all values for offsets are ready, except the final offsets diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu index f03e02c2e2..f6364e0cc9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu @@ -199,13 +199,11 @@ __global__ void __launch_bounds__(NumThreadsSingleBlock) routingIndicesBlockKern } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) -#if !defined(PDL_PROFILE) || PDL_PROFILE == 0 // we can trigger the next kernel at this point if constexpr (KernelParams::UsePdl) { cudaTriggerProgrammaticLaunchCompletion(); } -#endif #endif for (int tokenIdx = 0; tokenIdx < params.mNumTokens; tokenIdx++) diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt index 494788c228..977ae7f915 100644 --- 
a/cpp/tensorrt_llm/thop/CMakeLists.txt +++ b/cpp/tensorrt_llm/thop/CMakeLists.txt @@ -83,7 +83,7 @@ add_library( reducescatterOp.cpp relativeAttentionBiasOp.cpp dsv3RouterGemmOp.cpp - renormMoeRoutingOp.cpp + customMoeRoutingOp.cpp selectiveScanOp.cpp userbuffersFinalizeOp.cpp userbuffersTensor.cpp diff --git a/cpp/tensorrt_llm/thop/renormMoeRoutingOp.cpp b/cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp similarity index 75% rename from cpp/tensorrt_llm/thop/renormMoeRoutingOp.cpp rename to cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp index 616cf3bb7e..814fdf87c3 100644 --- a/cpp/tensorrt_llm/thop/renormMoeRoutingOp.cpp +++ b/cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp @@ -15,7 +15,7 @@ */ #include "tensorrt_llm/common/opUtils.h" -#include "tensorrt_llm/kernels/renormMoeRoutingKernels.h" +#include "tensorrt_llm/kernels/customMoeRoutingKernels.h" #include "tensorrt_llm/runtime/torchUtils.h" namespace th = torch; @@ -25,7 +25,8 @@ namespace tk = tensorrt_llm::kernels; namespace torch_ext { -std::tuple renorm_moe_routing_op(th::Tensor const& router_logits, int64_t topk) +template +std::tuple custom_moe_routing_op(th::Tensor const& router_logits, int64_t topk) { auto data_type = router_logits.scalar_type(); auto input_size = router_logits.sizes(); @@ -44,20 +45,22 @@ std::tuple renorm_moe_routing_op(th::Tensor const& route { case torch::kFloat32: // Handle Float32 - tk::invokeRenormMoeRouting(reinterpret_cast(router_logits.mutable_data_ptr()), + tk::invokeRenormMoeRouting( + reinterpret_cast(router_logits.mutable_data_ptr()), reinterpret_cast(topk_values.mutable_data_ptr()), reinterpret_cast(topk_indices.mutable_data_ptr()), num_tokens, num_experts, topk, stream); break; case torch::kBFloat16: // Handle BFloat16 - tk::invokeRenormMoeRouting<__nv_bfloat16, float, int32_t>( + tk::invokeRenormMoeRouting<__nv_bfloat16, float, int32_t, DoSoftmaxBeforeTopK>( reinterpret_cast<__nv_bfloat16*>(router_logits.mutable_data_ptr()), reinterpret_cast(topk_values.mutable_data_ptr()), reinterpret_cast(topk_indices.mutable_data_ptr()), num_tokens, num_experts, topk, stream); break; case torch::kHalf: // Handle Half - tk::invokeRenormMoeRouting(reinterpret_cast(router_logits.mutable_data_ptr()), + tk::invokeRenormMoeRouting( + reinterpret_cast(router_logits.mutable_data_ptr()), reinterpret_cast(topk_values.mutable_data_ptr()), reinterpret_cast(topk_indices.mutable_data_ptr()), num_tokens, num_experts, topk, stream); break; @@ -69,6 +72,15 @@ std::tuple renorm_moe_routing_op(th::Tensor const& route return {topk_indices, topk_values}; } +std::tuple renorm_moe_routing_op(th::Tensor const& router_logits, int64_t topk) +{ + return custom_moe_routing_op(router_logits, topk); +} + +std::tuple default_moe_routing_op(th::Tensor const& router_logits, int64_t topk) +{ + return custom_moe_routing_op(router_logits, topk); +} } // namespace torch_ext TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -82,3 +94,15 @@ TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { m.impl("renorm_moe_routing_op", &torch_ext::renorm_moe_routing_op); } + +TORCH_LIBRARY_FRAGMENT(trtllm, m) +{ + m.def( + "default_moe_routing_op(Tensor router_logits, SymInt topk" + ") -> (Tensor, Tensor)"); +} + +TORCH_LIBRARY_IMPL(trtllm, CUDA, m) +{ + m.impl("default_moe_routing_op", &torch_ext::default_moe_routing_op); +} diff --git a/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py index ba71e4fbfe..098af11fc8 100644 --- a/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py @@ -531,3 
+531,11 @@ def _register_fake(): return router_logits.new_empty( sz, dtype=torch.int32), router_logits.new_empty(sz, dtype=torch.float32) + + @torch.library.register_fake("trtllm::default_moe_routing_op") + def _(router_logits, topk): + num_tokens = router_logits.shape[0] + sz = (num_tokens, topk) + return router_logits.new_empty( + sz, dtype=torch.int32), router_logits.new_empty(sz, + dtype=torch.float32) diff --git a/tensorrt_llm/_torch/modules/fused_moe/routing.py b/tensorrt_llm/_torch/modules/fused_moe/routing.py index 34c2179593..635091c7ad 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/routing.py +++ b/tensorrt_llm/_torch/modules/fused_moe/routing.py @@ -183,18 +183,28 @@ class BaseMoeRoutingMethod(nn.Module): class DefaultMoeRoutingMethod(BaseMoeRoutingMethod): - def __init__(self, top_k: int): + def __init__(self, top_k: int, force_enable_pytorch_op: bool = False): super().__init__() self.top_k = top_k + self.force_enable_pytorch_op = force_enable_pytorch_op - def apply(self, - router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + def apply_pytorch( + self, router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): topk_values, topk_indices = torch.topk(torch.nn.functional.softmax( router_logits.float(), dim=-1), k=self.top_k, dim=-1) return topk_indices.to(torch.int32), topk_values + def apply(self, + router_logits: torch.Tensor) -> (torch.Tensor, torch.Tensor): + num_experts = router_logits.shape[-1] + if self.force_enable_pytorch_op or num_experts > 128 or self.top_k > 8: + return self.apply_pytorch(router_logits) + else: + return torch.ops.trtllm.default_moe_routing_op( + router_logits, self.top_k) + @property def routing_method_type(self): return RoutingMethodType.Default From 90bfc8cc29e3ee5ecc18ea3f7eb1825e34da0da5 Mon Sep 17 00:00:00 2001 From: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> Date: Thu, 21 Aug 2025 17:58:30 +0800 Subject: [PATCH 10/33] [https://nvbugs/5453827][fix] Fix RPATH of th_common shared library to find pip-installed NCCL (#6984) Signed-off-by: Yuan Tong <13075180+tongyuantongyu@users.noreply.github.com> --- cpp/tensorrt_llm/CMakeLists.txt | 3 +-- cpp/tensorrt_llm/thop/CMakeLists.txt | 6 +++--- jenkins/L0_Test.groovy | 5 ----- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index c4814c1d4e..2e625f4687 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -294,8 +294,7 @@ if(TARGET ${NIXL_WRAPPER_TARGET}) endif() if(NOT WIN32) - set_target_properties(${SHARED_TARGET} PROPERTIES LINK_FLAGS - "-Wl,-rpath='$ORIGIN'") + set_target_properties(${SHARED_TARGET} PROPERTIES BUILD_RPATH "$ORIGIN") endif() if(BUILD_PYT) diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt index 977ae7f915..6224c0d2c9 100644 --- a/cpp/tensorrt_llm/thop/CMakeLists.txt +++ b/cpp/tensorrt_llm/thop/CMakeLists.txt @@ -119,9 +119,9 @@ endif() if(NOT WIN32) set_target_properties( - th_common - PROPERTIES LINK_FLAGS - "-Wl,-rpath='$ORIGIN' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}") + th_common PROPERTIES BUILD_RPATH "$ORIGIN;$ORIGIN/../../nvidia/nccl/lib") + set_target_properties( + th_common PROPERTIES LINK_FLAGS "${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}") else() target_link_libraries(th_common PRIVATE context_attention_src) endif() diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index f43d454ac8..e99081d1c8 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -2079,11 +2079,6 @@ def 
launchTestJobs(pipeline, testFilter, dockerNode=null) checkPipStage = true } - if (cpu_arch == AARCH64_TRIPLE && values[5] != DLFW_IMAGE) { - checkPipStage = false - echo "Skip pip install sanity check due to https://nvbugs/5453827" - } - if (checkPipStage) { stage("Run LLMAPI tests") { pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch) From 9a2b44d0f2336d3e7085076968a9b09fc05158db Mon Sep 17 00:00:00 2001 From: brb-nv <169953907+brb-nv@users.noreply.github.com> Date: Thu, 21 Aug 2025 08:21:27 -0700 Subject: [PATCH 11/33] [None][chore] No-op changes to support context parallelism in disaggregated serving later (#7063) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> --- .../executor/dataTransceiverState.h | 32 ++- .../batch_manager/cacheFormatter.cpp | 8 + .../batch_manager/mlaCacheFormatter.cpp | 14 +- cpp/tensorrt_llm/executor/serialization.cpp | 7 +- .../batch_manager/cacheTransceiverTest.cpp | 208 ++++++++++-------- .../unit_tests/executor/agentCommTest.cpp | 2 +- .../executor/serializeUtilsTest.cpp | 2 +- 7 files changed, 155 insertions(+), 118 deletions(-) diff --git a/cpp/include/tensorrt_llm/executor/dataTransceiverState.h b/cpp/include/tensorrt_llm/executor/dataTransceiverState.h index 2a2f1f8369..98b26a276c 100644 --- a/cpp/include/tensorrt_llm/executor/dataTransceiverState.h +++ b/cpp/include/tensorrt_llm/executor/dataTransceiverState.h @@ -52,29 +52,30 @@ public: AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2) : mModelConfig(std::move(modelConfig)) , mParallelConfig{worldConfig.getTensorParallelism(), worldConfig.getPipelineParallelism(), - worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(), worldConfig.getTensorParallelism()} + worldConfig.getContextParallelism(), worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(), + worldConfig.getTensorParallelism()} , mDataType{dataType} , mAttentionConfig(attentionType, kvFactor) { } CacheState(std::vector nbKvHeadPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock, - SizeType32 tensorParallelism, SizeType32 pipelineParallelism, nvinfer1::DataType dataType, - AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false, - int DPrank = 0, int DPsize = 0) + SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism, + nvinfer1::DataType dataType, AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, + bool enableAttentionDP = false, int DPrank = 0, int DPsize = 0) : mModelConfig{std::move(nbKvHeadPerLayer), sizePerHead, tokensPerBlock} - , mParallelConfig{tensorParallelism, pipelineParallelism, enableAttentionDP, DPrank, DPsize} + , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize} , mDataType{dataType} , mAttentionConfig(attentionType, kvFactor) { } CacheState(SizeType32 nbAttentionLayers, SizeType32 nbKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock, - SizeType32 tensorParallelism, SizeType32 pipelineParallelism, nvinfer1::DataType dataType, - AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false, - int DPrank = 0, int DPsize = 0) + SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism, + nvinfer1::DataType dataType, AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, + bool enableAttentionDP = false, int DPrank = 0, int DPsize = 0) : 
mModelConfig{std::vector(nbAttentionLayers, nbKvHeads), sizePerHead, tokensPerBlock} - , mParallelConfig{tensorParallelism, pipelineParallelism, enableAttentionDP, DPrank, DPsize} + , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize} , mDataType{dataType} , mAttentionConfig(attentionType, kvFactor) { @@ -83,7 +84,7 @@ public: [[nodiscard]] bool operator==(kv_cache::CacheState const& other) const noexcept { return mModelConfig == other.mModelConfig && mParallelConfig == other.mParallelConfig - && mDataType == other.mDataType; + && mAttentionConfig == other.mAttentionConfig && mDataType == other.mDataType; } struct ModelConfig @@ -103,6 +104,7 @@ public: { SizeType32 mTensorParallelism; SizeType32 mPipelineParallelism; + SizeType32 mContextParallelism; bool mEnableAttentionDP; SizeType32 mDPrank; SizeType32 mDPsize; @@ -110,8 +112,8 @@ public: [[nodiscard]] bool operator==(ParallelConfig const& other) const noexcept { return mTensorParallelism == other.mTensorParallelism && mPipelineParallelism == other.mPipelineParallelism - && mEnableAttentionDP == other.mEnableAttentionDP && mDPrank == other.mDPrank - && mDPsize == other.mDPsize; + && mContextParallelism == other.mContextParallelism && mEnableAttentionDP == other.mEnableAttentionDP + && mDPrank == other.mDPrank && mDPsize == other.mDPsize; } }; @@ -125,6 +127,11 @@ public: { } + [[nodiscard]] bool operator==(AttentionConfig const& other) const noexcept + { + return mAttentionType == other.mAttentionType && mKvFactor == other.mKvFactor; + } + // attentionType ; AttentionType mAttentionType; int mKvFactor; @@ -162,6 +169,7 @@ public: sstring << "mTokensPerBlock:" << mModelConfig.mTokensPerBlock << "\n"; sstring << "tp:" << mParallelConfig.mTensorParallelism << "\n"; sstring << "pp:" << mParallelConfig.mPipelineParallelism << "\n"; + sstring << "cp:" << mParallelConfig.mContextParallelism << "\n"; sstring << "enableAttentionDP:" << mParallelConfig.mEnableAttentionDP << "\n"; sstring << "datatype:" << static_cast(mDataType) << "\n"; sstring << "attentionType:" << static_cast(mAttentionConfig.mAttentionType) << "\n"; diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp index 503c2e6c5d..e73e0f1541 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp @@ -822,6 +822,14 @@ void CacheFormatter::unformat(TransferSession& session) TLLM_LOG_WARNING("CacheFormatter::inquireSupport: only support non-MLA"); return false; } + if (selfConfig.getParallelConfig().mContextParallelism != 1 + || destConfig.getParallelConfig().mContextParallelism != 1) + { + TLLM_LOG_WARNING( + "CacheFormatter::inquireSupport: context parallelism is not currently supported (selfCP=%d, destCP=%d).", + selfConfig.getParallelConfig().mContextParallelism, destConfig.getParallelConfig().mContextParallelism); + return false; + } std::unordered_set setVecDest{ destConfig.getModelConfig().mNbKvHeadsPerLayer.begin(), destConfig.getModelConfig().mNbKvHeadsPerLayer.end()}; diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp index 22756f2552..eaa2e957e8 100644 --- a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp @@ -558,18 +558,20 @@ void MLACacheFormatter::unformat(TransferSession& session) TLLM_LOG_WARNING("MLACacheFormatter::inquireSupport: only support MLA"); return 
false; } - - if (selfConfig.getAttentionConfig().mKvFactor != destConfig.getAttentionConfig().mKvFactor) - { - TLLM_LOG_WARNING("MLACacheFormatter::inquireSupport: only support same kv factor"); - return false; - } if (selfConfig.getParallelConfig().mEnableAttentionDP && (selfConfig.getParallelConfig().mTensorParallelism % selfConfig.getParallelConfig().mDPsize != 0)) { TLLM_LOG_WARNING("MLACacheFormatter::inquireSupport: TP size must be divisible by DP size"); return false; } + if (selfConfig.getParallelConfig().mContextParallelism != 1 + || destConfig.getParallelConfig().mContextParallelism != 1) + { + TLLM_LOG_WARNING( + "MLACacheFormatter::inquireSupport: context parallelism is not currently supported (selfCP=%d, destCP=%d).", + selfConfig.getParallelConfig().mContextParallelism, destConfig.getParallelConfig().mContextParallelism); + return false; + } if (destConfig.getParallelConfig().mEnableAttentionDP && (destConfig.getParallelConfig().mTensorParallelism % destConfig.getParallelConfig().mDPsize != 0)) { diff --git a/cpp/tensorrt_llm/executor/serialization.cpp b/cpp/tensorrt_llm/executor/serialization.cpp index 738a095eef..bba8d19e2f 100644 --- a/cpp/tensorrt_llm/executor/serialization.cpp +++ b/cpp/tensorrt_llm/executor/serialization.cpp @@ -531,14 +531,15 @@ kv_cache::CacheState Serialization::deserializeCacheState(std::istream& is) auto tokensPerBlock = su::deserialize(is); auto tensorParallelism = su::deserialize(is); auto pipelineParallelism = su::deserialize(is); + auto contextParallelism = su::deserialize(is); auto enableAttentionDP = su::deserialize(is); auto DPrank = su::deserialize(is); auto DPsize = su::deserialize(is); auto dataType = su::deserialize(is); auto attentionType = su::deserialize(is); auto kvFactor = su::deserialize(is); - return CacheState{nbKvHeadsPerLayer, sizePerHead, tokensPerBlock, tensorParallelism, pipelineParallelism, dataType, - attentionType, kvFactor, enableAttentionDP, DPrank, DPsize}; + return CacheState{nbKvHeadsPerLayer, sizePerHead, tokensPerBlock, tensorParallelism, pipelineParallelism, + contextParallelism, dataType, attentionType, kvFactor, enableAttentionDP, DPrank, DPsize}; } void Serialization::serialize(kv_cache::CacheState const& state, std::ostream& os) @@ -548,6 +549,7 @@ void Serialization::serialize(kv_cache::CacheState const& state, std::ostream& o su::serialize(state.mModelConfig.mTokensPerBlock, os); su::serialize(state.mParallelConfig.mTensorParallelism, os); su::serialize(state.mParallelConfig.mPipelineParallelism, os); + su::serialize(state.mParallelConfig.mContextParallelism, os); su::serialize(state.mParallelConfig.mEnableAttentionDP, os); su::serialize(state.mParallelConfig.mDPrank, os); su::serialize(state.mParallelConfig.mDPsize, os); @@ -564,6 +566,7 @@ size_t Serialization::serializedSize(kv_cache::CacheState const& state) totalSize += su::serializedSize(state.mModelConfig.mTokensPerBlock); totalSize += su::serializedSize(state.mParallelConfig.mTensorParallelism); totalSize += su::serializedSize(state.mParallelConfig.mPipelineParallelism); + totalSize += su::serializedSize(state.mParallelConfig.mContextParallelism); totalSize += su::serializedSize(state.mParallelConfig.mEnableAttentionDP); totalSize += su::serializedSize(state.mParallelConfig.mDPrank); totalSize += su::serializedSize(state.mParallelConfig.mDPsize); diff --git a/cpp/tests/batch_manager/cacheTransceiverTest.cpp b/cpp/tests/batch_manager/cacheTransceiverTest.cpp index 99c40f810f..4b513ae57f 100644 --- a/cpp/tests/batch_manager/cacheTransceiverTest.cpp +++ 
b/cpp/tests/batch_manager/cacheTransceiverTest.cpp @@ -99,7 +99,7 @@ TEST_F(RequestInfoTest, Basic) } auto state = std::make_unique(); state->setCommState(texec::kv_cache::CommState{12, "127.0.0.1"}); - state->setCacheState(texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, nvinfer1::DataType::kFLOAT}); + state->setCacheState(texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 8, nvinfer1::DataType::kFLOAT}); RequestInfo info{1, *state}; auto info2 = serializeDeserialize(info); EXPECT_EQ(info, info2); @@ -133,7 +133,7 @@ TEST_F(CacheConfigTest, EqualTo) constexpr SizeType32 tokensPerBlock{64}; constexpr SizeType32 tensorParallelism{8}; constexpr SizeType32 pipelineParallelism{2}; - constexpr SizeType32 contextParallelism{1}; + constexpr SizeType32 contextParallelism{2}; constexpr SizeType32 sizePerHead{hiddenSize / nbHeads}; constexpr CacheState::AttentionType attentionType{CacheState::AttentionType::kDEFAULT}; constexpr int kvFactor = 2; @@ -148,7 +148,7 @@ TEST_F(CacheConfigTest, EqualTo) texec::kv_cache::CacheState state0{ cacheStateCfg, worldConfig, modelConfig.getKvDataType(), attentionType, kvFactor}; texec::kv_cache::CacheState state1{nbAttentionLayers, nbHeads, sizePerHead, tokensPerBlock, tensorParallelism, - pipelineParallelism, dtype, attentionType, kvFactor, false, 0, tensorParallelism}; + pipelineParallelism, contextParallelism, dtype, attentionType, kvFactor, false, 0, tensorParallelism}; EXPECT_EQ(state0, state1); } @@ -165,7 +165,7 @@ public: ON_CALL(*this, recvRequestInfo) .WillByDefault(Return(RequestInfo{0, texec::DataTransceiverState{ - texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, nvinfer1::DataType::kFLOAT}, + texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 1, nvinfer1::DataType::kFLOAT}, texec::kv_cache::CommState{std::vector{0}, 0}}})); ON_CALL(*this, getCounterpartsCount).WillByDefault(Return(1)); } @@ -217,7 +217,8 @@ TEST_F(MockTransceiverTest, MpiResponderBasic) auto sender = std::make_unique(); EXPECT_CALL(*sender, recvRequestInfo) .WillOnce(Return(RequestInfo{0, - texec::DataTransceiverState{texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, nvinfer1::DataType::kFLOAT}, + texec::DataTransceiverState{ + texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 1, nvinfer1::DataType::kFLOAT}, texec::kv_cache::CommState{std::vector{0}, 0}}})); EXPECT_CALL(*sender, sendSync).WillOnce(Return()); EXPECT_CALL(*sender, getCounterpartsCount).WillOnce(Return(1)); @@ -318,7 +319,7 @@ protected: dataType, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks, CacheType::kSELF, std::nullopt, nullptr, true); mCacheState = std::make_unique( - numLayers, numHeads, sizePerHead, tokensPerBlock, 1, 1, dataType); + numLayers, numHeads, sizePerHead, tokensPerBlock, 1, 1, 1, dataType); if (tensorrt_llm::common::getEnvUseUCXKvCache()) { @@ -506,7 +507,7 @@ TEST_F(SymmetricalCacheTest, SimpleTest) #if ENABLE_MULTI_DEVICE using AsymmetricTestParam - = std::tuple; + = std::tuple; class AsymmetricalCacheTest : public ::testing::TestWithParam { @@ -516,8 +517,8 @@ protected: void TearDown() override {} - void setUpCommunicator(int contextTp, int contextPp, int genTp, int genPp, bool isMLA = false, - bool contextDP = false, bool generationDP = false) + void setUpCommunicator(int contextTp, int contextPp, int contextCp, int genTp, int genPp, int genCp, + bool isMLA = false, bool contextDP = false, bool generationDP = false) { #if ENABLE_MULTI_DEVICE tensorrt_llm::mpi::initialize(tensorrt_llm::mpi::MpiThreadSupport::THREAD_MULTIPLE); @@ -572,11 +573,13 @@ protected: { 
mTpSize = contextTp; mPpSize = contextPp; + mCpSize = contextCp; } if (mIsGeneration) { mTpSize = genTp; mPpSize = genPp; + mCpSize = genCp; } mTpRank = mRankInInstance % mTpSize; @@ -585,6 +588,7 @@ protected: mGenRankSize = genRanks; mContextTpSize = contextTp; mContextPpSize = contextPp; + mContextCpSize = contextCp; EXPECT_EQ((sessionComm.getRank()), mRankInInstance); EXPECT_EQ(sessionComm.getSize(), mSizeInInstance); @@ -696,11 +700,12 @@ protected: texec::kv_cache::CacheState::AttentionType attentionType = isMLA ? texec::kv_cache::CacheState::AttentionType::kMLA : texec::kv_cache::CacheState::AttentionType::kDEFAULT; - mCacheState = std::make_unique(numLayers, numHeadsPerRank, sizePerHead, - tokensPerBlock, mTpSize, mPpSize, dataType, attentionType, kvFactor, enableDPAttention, DPrank, DPsize); + mCacheState + = std::make_unique(numLayers, numHeadsPerRank, sizePerHead, tokensPerBlock, + mTpSize, mPpSize, mCpSize, dataType, attentionType, kvFactor, enableDPAttention, DPrank, DPsize); mContextCacheState = std::make_unique(numLayers, numHeadsPerRankForContext, - sizePerHead, tokensPerBlock, mContextTpSize, mContextPpSize, dataType, attentionType, kvFactor, mContextDP, - DPrank, mContextTpSize); + sizePerHead, tokensPerBlock, mContextTpSize, mContextPpSize, mContextCpSize, dataType, attentionType, + kvFactor, mContextDP, DPrank, mContextTpSize); // UVM seems to be incompatible with MPI, and it is continuing to investigate. bool constexpr useUvm = false; @@ -859,7 +864,8 @@ protected: texec::kv_cache::CacheState cacheState{mContextCacheState->getModelConfig().mNbKvHeadsPerLayer, mContextCacheState->getModelConfig().mSizePerHead, mContextCacheState->getModelConfig().mTokensPerBlock, mContextCacheState->getParallelConfig().mTensorParallelism, - mContextCacheState->getParallelConfig().mPipelineParallelism, mContextCacheState->getDataType(), + mContextCacheState->getParallelConfig().mPipelineParallelism, + mContextCacheState->getParallelConfig().mContextParallelism, mContextCacheState->getDataType(), mContextCacheState->getAttentionConfig().mAttentionType, mContextCacheState->getAttentionConfig().mKvFactor, mContextCacheState->getParallelConfig().mEnableAttentionDP, contextDpRank, mContextCacheState->getParallelConfig().mTensorParallelism}; @@ -1094,8 +1100,8 @@ protected: tensorrt_llm::mpi::MpiComm const* mComm; tensorrt_llm::mpi::MpiComm mParticipatingComm{nullptr, false}; SizeType32 mWorldSize{0}, mRank{0}, mRankInInstance{0}; - SizeType32 mSizeInInstance{0}, mTpRank{0}, mPpRank{0}, mTpSize{0}, mPpSize{0}, mContextRankSize{0}, mGenRankSize{0}, - mContextTpSize{0}, mContextPpSize{0}; + SizeType32 mSizeInInstance{0}, mTpRank{0}, mPpRank{0}, mTpSize{0}, mPpSize{0}, mCpSize{0}, mContextRankSize{0}, + mGenRankSize{0}, mContextTpSize{0}, mContextPpSize{0}, mContextCpSize{0}; LlmRequest::RequestIdType mRequestId{0}; bool mContextDP{false}; bool mGenerationDP{false}; @@ -1129,22 +1135,24 @@ TEST_P(AsymmetricalCacheTest, TestCase) AsymmetricTestParam param = GetParam(); int contextTp = std::get<0>(param); int contextPp = std::get<1>(param); - int genTp = std::get<2>(param); - int genPp = std::get<3>(param); - int numLayers = std::get<4>(param); - int numHeads = std::get<5>(param); - int sizePerHead = std::get<6>(param); - int tokensPerBlock = std::get<7>(param); - nvinfer1::DataType dataType = std::get<8>(param); + int contextCp = std::get<2>(param); + int genTp = std::get<3>(param); + int genPp = std::get<4>(param); + int genCp = std::get<5>(param); + int numLayers = std::get<6>(param); + int 
numHeads = std::get<7>(param); + int sizePerHead = std::get<8>(param); + int tokensPerBlock = std::get<9>(param); + nvinfer1::DataType dataType = std::get<10>(param); - int kvFactor = std::get<9>(param); - bool isMLA = std::get<10>(param); - bool contextDP = std::get<11>(param); - bool generationDP = std::get<12>(param); + int kvFactor = std::get<11>(param); + bool isMLA = std::get<12>(param); + bool contextDP = std::get<13>(param); + bool generationDP = std::get<14>(param); - bool isWindow = std::get<13>(param); + bool isWindow = std::get<15>(param); - setUpCommunicator(contextTp, contextPp, genTp, genPp, isMLA, contextDP, generationDP); + setUpCommunicator(contextTp, contextPp, contextCp, genTp, genPp, genCp, isMLA, contextDP, generationDP); if (mIsContext || mIsGeneration) { @@ -1221,21 +1229,23 @@ TEST_P(AsymmetricalCacheTestWithDP, TestCase) AsymmetricTestParam param = GetParam(); int contextTp = std::get<0>(param); int contextPp = std::get<1>(param); - int genTp = std::get<2>(param); - int genPp = std::get<3>(param); - int numLayers = std::get<4>(param); - int numHeads = std::get<5>(param); - int sizePerHead = std::get<6>(param); - int tokensPerBlock = std::get<7>(param); - nvinfer1::DataType dataType = std::get<8>(param); + int contextCp = std::get<2>(param); + int genTp = std::get<3>(param); + int genPp = std::get<4>(param); + int genCp = std::get<5>(param); + int numLayers = std::get<6>(param); + int numHeads = std::get<7>(param); + int sizePerHead = std::get<8>(param); + int tokensPerBlock = std::get<9>(param); + nvinfer1::DataType dataType = std::get<10>(param); - int kvFactor = std::get<9>(param); - bool isMLA = std::get<10>(param); - bool contextDP = std::get<11>(param); - bool generationDP = std::get<12>(param); - bool isWindow = std::get<13>(param); + int kvFactor = std::get<11>(param); + bool isMLA = std::get<12>(param); + bool contextDP = std::get<13>(param); + bool generationDP = std::get<14>(param); + bool isWindow = std::get<15>(param); - setUpCommunicator(contextTp, contextPp, genTp, genPp, isMLA, contextDP, generationDP); + setUpCommunicator(contextTp, contextPp, contextCp, genTp, genPp, genCp, isMLA, contextDP, generationDP); if (mIsContext || mIsGeneration) { @@ -1324,95 +1334,95 @@ TEST_P(AsymmetricalCacheTestWithDP, TestCase) } INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest0, AsymmetricalCacheTest, - testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), - testing::Values(4), testing::Values(4), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), + testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2), + testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(4), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), testing::Values(false), testing::Values(false), testing::Values(false), testing::Values(true, false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithWindow, AsymmetricalCacheTest, - testing::Combine(testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(5), - testing::Values(4), testing::Values(4), testing::Values(8), + testing::Combine(testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(1), + testing::Values(1), testing::Values(5), testing::Values(4), testing::Values(4), 
testing::Values(8), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), testing::Values(false), testing::Values(false), testing::Values(false), testing::Values(true))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest1, AsymmetricalCacheTest, - testing::Combine(testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(4), testing::Values(8), - testing::Values(4), testing::Values(4), testing::Values(8), + testing::Combine(testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(4), + testing::Values(1), testing::Values(8), testing::Values(4), testing::Values(4), testing::Values(8), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), testing::Values(false), testing::Values(false), testing::Values(false), testing::Values(false, true))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest2, AsymmetricalCacheTest, - testing::Combine(testing::Values(1), testing::Values(2), testing::Values(1), testing::Values(1, 4), - testing::Values(16), testing::Values(16), testing::Values(4), testing::Values(8), - testing::Values(nvinfer1::DataType::kFLOAT), testing::Values(2), testing::Values(false), testing::Values(false), - testing::Values(false), testing::Values(false))); + testing::Combine(testing::Values(1), testing::Values(2), testing::Values(1), testing::Values(1), + testing::Values(1, 4), testing::Values(1), testing::Values(16), testing::Values(16), testing::Values(4), + testing::Values(8), testing::Values(nvinfer1::DataType::kFLOAT), testing::Values(2), testing::Values(false), + testing::Values(false), testing::Values(false), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest0ForMLA, AsymmetricalCacheTest, - testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), - testing::Values(4), testing::Values(1), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1), + testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2), + testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1), testing::Values(true), testing::Values(false), testing::Values(false), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest1ForMLA, AsymmetricalCacheTest, - testing::Combine(testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(4), testing::Values(4), - testing::Values(1), testing::Values(4), testing::Values(8), + testing::Combine(testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(4), + testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4), testing::Values(8), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1), testing::Values(true), testing::Values(false), testing::Values(false), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForMLA1, AsymmetricalCacheTestWithDP, - testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), - testing::Values(4), testing::Values(1), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1), + testing::Combine(testing::Values(1, 
2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2), + testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1), testing::Values(true), testing::Values(true), testing::Values(true), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForMLA2, AsymmetricalCacheTestWithDP, - testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), - testing::Values(4), testing::Values(1), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1), + testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2), + testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1), testing::Values(true), testing::Values(true), testing::Values(false), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForMLA3, AsymmetricalCacheTestWithDP, - testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), - testing::Values(4), testing::Values(1), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1), + testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2), + testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1), testing::Values(true), testing::Values(false), testing::Values(true), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLA, AsymmetricalCacheTestWithDP, - testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), - testing::Values(4), testing::Values(4), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), + testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2), + testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(4), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), testing::Values(false), testing::Values(true), testing::Values(true), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLA1, AsymmetricalCacheTestWithDP, - testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), - testing::Values(4), testing::Values(4), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), + testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2), + testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(4), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), testing::Values(false), testing::Values(true), testing::Values(false), 
testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLA2, AsymmetricalCacheTestWithDP, - testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), testing::Values(1, 2), - testing::Values(4), testing::Values(4), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), + testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2), + testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(4), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), testing::Values(false), testing::Values(false), testing::Values(true), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLADuplicate0, AsymmetricalCacheTestWithDP, - testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(4), testing::Values(1), - testing::Values(4), testing::Values(2), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), + testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(4), + testing::Values(1), testing::Values(1), testing::Values(4), testing::Values(2), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), testing::Values(false), testing::Values(true, false), testing::Values(false), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLADuplicate1, AsymmetricalCacheTestWithDP, - testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(2), testing::Values(2), - testing::Values(4), testing::Values(1), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), + testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(2), + testing::Values(2), testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), testing::Values(false), testing::Values(true, false), testing::Values(false), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLADuplicate2, AsymmetricalCacheTestWithDP, - testing::Combine(testing::Values(4), testing::Values(1), testing::Values(4, 2), testing::Values(1), - testing::Values(4), testing::Values(2), testing::Values(4), testing::Values(16), - testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), + testing::Combine(testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(4, 2), + testing::Values(1), testing::Values(1), testing::Values(4), testing::Values(2), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), testing::Values(false), testing::Values(false), testing::Values(false), testing::Values(false))); INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLADuplicate4, AsymmetricalCacheTestWithDP, - testing::Combine(testing::Values(4), testing::Values(1), testing::Values(1, 2), testing::Values(2), - testing::Values(4), testing::Values(1, 2), testing::Values(4), testing::Values(16), - 
testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), + testing::Combine(testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(1, 2), + testing::Values(2), testing::Values(1), testing::Values(4), testing::Values(1, 2), testing::Values(4), + testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2), testing::Values(false), testing::Values(false), testing::Values(false), testing::Values(false))); #endif @@ -1430,8 +1440,10 @@ TEST(targetTest, CacheStateNODP) int contextPP = 2; int contextTP = 4; + int contextCP = 1; int genPP = 2; int genTP = 2; + int genCP = 1; bool const contextEnableDP = false; bool const genEnableDP = false; @@ -1441,10 +1453,10 @@ TEST(targetTest, CacheStateNODP) auto attentionType = isMLA ? texec::kv_cache::CacheState::AttentionType::kMLA : texec::kv_cache::CacheState::AttentionType::kDEFAULT; auto const contextCache = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, - tokensPerBlock, contextTP, contextPP, dataType, attentionType, kvFactor, contextEnableDP, 0, 0}; + tokensPerBlock, contextTP, contextPP, contextCP, dataType, attentionType, kvFactor, contextEnableDP, 0, 0}; auto const genCache = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, - tokensPerBlock, genTP, genPP, dataType, attentionType, kvFactor, genEnableDP, 0, 0}; + tokensPerBlock, genTP, genPP, genCP, dataType, attentionType, kvFactor, genEnableDP, 0, 0}; auto const contextTragetInfo = tensorrt_llm::executor::kv_cache::TargetRanksInfoForDP(genCache, contextCache, contextRank); @@ -1504,8 +1516,10 @@ TEST(targetTest, CacheStateContextDP) int contextPP = 1; int contextTP = 4; + int contextCP = 1; int genPP = 1; int genTP = 2; + int genCP = 1; bool contextEnableDP = true; bool genEnableDP = true; @@ -1519,10 +1533,11 @@ TEST(targetTest, CacheStateContextDP) auto const contextCache = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, tokensPerBlock, contextTP, - contextPP, dataType, attentionType, kvFactor, contextEnableDP, contextDPRank, contextTP}; + contextPP, contextCP, dataType, attentionType, kvFactor, contextEnableDP, contextDPRank, contextTP}; - auto const genCache = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, - tokensPerBlock, genTP, genPP, dataType, attentionType, kvFactor, genEnableDP, generationDPRank, genTP}; + auto const genCache + = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, tokensPerBlock, genTP, + genPP, genCP, dataType, attentionType, kvFactor, genEnableDP, generationDPRank, genTP}; auto const contextTragetInfo = tensorrt_llm::executor::kv_cache::TargetRanksInfoForDP(genCache, contextCache, contextRank); @@ -1625,10 +1640,11 @@ TEST(targetTest, CacheStateContextDP) auto const contextCache = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, tokensPerBlock, contextTP, - contextPP, dataType, attentionType, kvFactor, contextEnableDP, contextDPRank, contextTP}; + contextPP, contextCP, dataType, attentionType, kvFactor, contextEnableDP, contextDPRank, contextTP}; - auto const genCache = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, - tokensPerBlock, genTP, genPP, dataType, attentionType, kvFactor, genEnableDP, generationDPRank, genTP}; + auto const genCache + = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, tokensPerBlock, genTP, + 
genPP, genCP, dataType, attentionType, kvFactor, genEnableDP, generationDPRank, genTP}; auto const contextTragetInfo = tensorrt_llm::executor::kv_cache::TargetRanksInfoForDP(contextCache, genCache, generationRank); diff --git a/cpp/tests/unit_tests/executor/agentCommTest.cpp b/cpp/tests/unit_tests/executor/agentCommTest.cpp index 9c23f33f50..d9e6aaa138 100644 --- a/cpp/tests/unit_tests/executor/agentCommTest.cpp +++ b/cpp/tests/unit_tests/executor/agentCommTest.cpp @@ -90,7 +90,7 @@ protected: size_t maxNumTokens = 1024; mTransBufferManager = std::make_unique(mCacheManager.get(), maxNumTokens); - mCacheState = std::make_unique(numLayers, numHeads, sizePerHead, tokensPerBlock, 1, 1, dataType); + mCacheState = std::make_unique(numLayers, numHeads, sizePerHead, tokensPerBlock, 1, 1, 1, dataType); } void TearDown() override diff --git a/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp b/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp index 27fff8df7d..1dad1fa2bb 100644 --- a/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp +++ b/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp @@ -726,7 +726,7 @@ TEST(SerializeUtilsTest, ContextPhaseParams) { auto state = std::make_unique(); state->setCommState(texec::kv_cache::CommState{12, "127.0.0.1"}); - state->setCacheState(texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, nvinfer1::DataType::kFLOAT}); + state->setCacheState(texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 8, nvinfer1::DataType::kFLOAT}); auto stats = texec::ContextPhaseParams({10, 20, 30, 40, 50, 60}, 0, state.release(), VecTokens{10, 20}); auto stats2 = serializeDeserialize(stats); EXPECT_EQ(stats, stats2); From f49dafe0da0d4790145f620f649601290c69f5d6 Mon Sep 17 00:00:00 2001 From: Dimitrios Bariamis Date: Thu, 21 Aug 2025 18:08:38 +0200 Subject: [PATCH 12/33] [https://nvbugs/5394409][feat] Support Mistral Small 3.1 multimodal in Triton Backend (#6714) Signed-off-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com> Signed-off-by: Dimitrios Bariamis Co-authored-by: Dimitrios Bariamis <12195802+dbari@users.noreply.github.com> Co-authored-by: Iman Tabrizian <10105175+Tabrizian@users.noreply.github.com> --- .gitattributes | 2 + .../defs/triton_server/build_engines.py | 72 ++++++ .../integration/defs/triton_server/common.py | 10 +- .../defs/triton_server/conftest.py | 13 + tests/integration/defs/triton_server/test.sh | 14 +- .../defs/triton_server/test_triton_llm.py | 196 +++++++++++++++ .../test_input_files/excel_table_test.jpg | Bin 20174 -> 130 bytes .../integration/test_input_files/merlion.png | 3 + ...els-franco-monsalvo-252430633-32285228.jpg | 3 + .../pexels-maxim-shklyaev-1511525-2914194.jpg | 3 + .../pexels-ron-lach-8975010.jpg | 3 + .../test_lists/test-db/l0_a100.yml | 8 + .../preprocessing/1/model.py | 199 +++++++++++++-- .../preprocessing/config.pbtxt | 13 +- .../tensorrt_llm/1/model.py | 4 +- .../tensorrt_llm/config.pbtxt | 2 +- .../tensorrt_llm_bls/1/lib/decode.py | 1 + .../tensorrt_llm_bls/1/lib/triton_decoder.py | 8 +- .../tensorrt_llm_bls/config.pbtxt | 9 +- .../multimodal/ensemble/config.pbtxt | 13 +- .../multimodal/multimodal_encoders/1/model.py | 231 +++++++++++++++--- .../multimodal_encoders/config.pbtxt | 7 +- .../multimodal/requirements-mistral3.1.txt | 1 + .../tests/test_llmapi_python_backend.py | 6 + .../all_models/tests/test_python_backend.py | 6 + .../all_models/tests/test_triton_decoder.py | 6 + triton_backend/ci/L0_backend_trtllm/test.sh | 1 + triton_backend/tools/multimodal/client.py | 67 ++++- 28 files changed, 821 
insertions(+), 80 deletions(-) create mode 100644 tests/integration/test_input_files/merlion.png create mode 100644 tests/integration/test_input_files/pexels-franco-monsalvo-252430633-32285228.jpg create mode 100644 tests/integration/test_input_files/pexels-maxim-shklyaev-1511525-2914194.jpg create mode 100644 tests/integration/test_input_files/pexels-ron-lach-8975010.jpg create mode 100644 triton_backend/all_models/multimodal/requirements-mistral3.1.txt diff --git a/.gitattributes b/.gitattributes index e72ba0fe7b..7486041ffd 100644 --- a/.gitattributes +++ b/.gitattributes @@ -7,3 +7,5 @@ triton_backend/tools/gpt/input_data.json filter=lfs diff=lfs merge=lfs -text *cubin.cpp filter=lfs diff=lfs merge=lfs -text docs/source/blogs/media/tech_blog3_mla_absorb.png filter=lfs diff=lfs merge=lfs -text +tests/integration/test_input_files/*.png filter=lfs diff=lfs merge=lfs -text +tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text diff --git a/tests/integration/defs/triton_server/build_engines.py b/tests/integration/defs/triton_server/build_engines.py index b8298d7309..1155b47d4a 100755 --- a/tests/integration/defs/triton_server/build_engines.py +++ b/tests/integration/defs/triton_server/build_engines.py @@ -1763,3 +1763,75 @@ def prepare_rcca_nvbug_4714193_engine(tensorrt_llm_example_root, assert os.path.exists(engine_dir), f"{engine_dir} does not exists." return engine_dir + + +def prepare_mistral3_pixtral_engine(tensorrt_llm_multimodal_example_root, + tensorrt_llm_llama_example_root, + mistral_small_model_root): + # Convert Mistral3 from HF + model_base_name = os.path.basename(mistral_small_model_root.rstrip("/")) + ckpt_dir = os.path.join(tensorrt_llm_multimodal_example_root, "model_dir", + model_base_name) + convert_cmd = [ + "python3", + f"{tensorrt_llm_llama_example_root}/convert_checkpoint.py", + "--dtype=bfloat16", + f"--model_dir={mistral_small_model_root}", + f"--output_dir={ckpt_dir}", + ] + + # Build Mistral3 LLM engine + engine_dir = os.path.join(tensorrt_llm_multimodal_example_root, + "engine_dir", model_base_name) + + build_cmd = [ + "trtllm-build", + f"--checkpoint_dir={ckpt_dir}", + "--max_batch_size=4", + "--max_input_len=8192", + "--max_seq_len=8192", + # Allow an arbitrary number of image tokens by setting: + # max_multimodal_len = max_batch_size * max_input_len + "--max_multimodal_len=32768", + "--use_paged_context_fmha=enable", + f"--output_dir={engine_dir}", + ] + + # Build Pixtral visual encoder engine + multimodal_engine_dir = os.path.join(tensorrt_llm_multimodal_example_root, + "tmp", "trt_engines", model_base_name, + "multimodal_encoder") + build_visual_engine_cmd = [ + "python3", + "build_multimodal_engine.py", + "--model_type=pixtral", + f"--model_path={mistral_small_model_root}", + f"--output_dir={multimodal_engine_dir}", + "--max_batch_size=2", + ] + + append_timing_cache_args(build_cmd) + convert_cmd = " ".join(convert_cmd) + build_cmd = " ".join(build_cmd) + build_visual_engine_cmd = " ".join(build_visual_engine_cmd) + if not os.path.exists(engine_dir) or not os.path.exists( + multimodal_engine_dir): + check_call(install_requirement_cmd, + shell=True, + cwd=tensorrt_llm_llama_example_root) + check_call(convert_cmd, shell=True) + check_call(build_cmd, shell=True) + check_call(build_visual_engine_cmd, + shell=True, + cwd=tensorrt_llm_multimodal_example_root) + + else: + print_info(f"Reusing engine: {engine_dir}") + print_info(f"Skipped: {convert_cmd}") + print_info(f"Skipped: {build_cmd}") + print_info(f"Skipped: 
{build_visual_engine_cmd}") + + assert os.path.exists(engine_dir), f"{engine_dir} does not exists." + assert os.path.exists( + multimodal_engine_dir), f"{multimodal_engine_dir} does not exists." + return engine_dir, multimodal_engine_dir diff --git a/tests/integration/defs/triton_server/common.py b/tests/integration/defs/triton_server/common.py index 174c1a7f58..fb41bdc00c 100644 --- a/tests/integration/defs/triton_server/common.py +++ b/tests/integration/defs/triton_server/common.py @@ -247,7 +247,8 @@ def modify_ib_config_pbtxt(REPO_PATH, CROSS_KV_CACHE_FRACTION="", ENCODER_INPUT_FEATURES_DTYPE="TYPE_FP16", GUIDED_DECODING_BACKEND="", - XGRAMMAR_TOKENIZER_INFO_PATH=""): + XGRAMMAR_TOKENIZER_INFO_PATH="", + PROMPT_EMBEDDING_TABLE_DTYPE="TYPE_FP16"): fill_template_py = os.path.join(llm_backend_repo_root, "tools", "fill_template.py") tensorrt_llm_config = os.path.join(llm_backend_repo_root, REPO_PATH, @@ -274,6 +275,7 @@ def modify_ib_config_pbtxt(REPO_PATH, check_call( f"python3 {fill_template_py} -i {multimodal_enc_config} triton_max_batch_size:{TRITON_MAX_BATCH_SIZE}," \ f"multimodal_model_path:{MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:{ENCODER_INPUT_FEATURES_DTYPE}," \ + f"prompt_embedding_table_data_type:{PROMPT_EMBEDDING_TABLE_DTYPE}," \ f"hf_model_path:{TOKENIZER_PATH}", shell=True) check_call( @@ -305,6 +307,7 @@ def modify_ib_config_pbtxt(REPO_PATH, f"lookahead_ngram_size:{EXECUTOR_LOOKAHEAD_NGRAM}," \ f"lookahead_verification_set_size:{EXECUTOR_LOOKAHEAD_VERIFICATION_SET}," \ f"encoder_input_features_data_type:{ENCODER_INPUT_FEATURES_DTYPE}," \ + f"prompt_embedding_table_data_type:{PROMPT_EMBEDDING_TABLE_DTYPE}," \ f"participant_ids:{PARTICIPANT_IDS_DRAFT}," \ f"logits_datatype:TYPE_FP32'", shell=True) @@ -329,6 +332,7 @@ def modify_ib_config_pbtxt(REPO_PATH, f"lookahead_ngram_size:{EXECUTOR_LOOKAHEAD_NGRAM}," \ f"lookahead_verification_set_size:{EXECUTOR_LOOKAHEAD_VERIFICATION_SET}," \ f"encoder_input_features_data_type:{ENCODER_INPUT_FEATURES_DTYPE}," \ + f"prompt_embedding_table_data_type:{PROMPT_EMBEDDING_TABLE_DTYPE}," \ f"participant_ids:{PARTICIPANT_IDS_TARGET}," \ f"logits_datatype:TYPE_FP32'", shell=True) @@ -348,7 +352,8 @@ def modify_ib_config_pbtxt(REPO_PATH, check_call( f"python3 {fill_template_py} -i {tensorrt_llm_bls_config} triton_max_batch_size:{TRITON_MAX_BATCH_SIZE}," \ f"decoupled_mode:{DECOUPLED_MODE},accumulate_tokens:{ACCUMULATE_TOKEN},bls_instance_count:{BLS_INSTANCE_COUNT}," \ - f"tensorrt_llm_model_name:{TENSORRT_LLM_TARGET_MODEL_NAME},tensorrt_llm_draft_model_name:{TENSORRT_LLM_DRAFT_MODEL_NAME},logits_datatype:TYPE_FP32", + f"tensorrt_llm_model_name:{TENSORRT_LLM_TARGET_MODEL_NAME},tensorrt_llm_draft_model_name:{TENSORRT_LLM_DRAFT_MODEL_NAME},logits_datatype:TYPE_FP32," \ + f"prompt_embedding_table_data_type:{PROMPT_EMBEDDING_TABLE_DTYPE}", shell=True) check_call( @@ -363,6 +368,7 @@ def modify_ib_config_pbtxt(REPO_PATH, f"gpu_weights_percent:{GPU_WEIGHTS_PERCENT},encoder_engine_dir:{ENCODER_ENGINE_PATH},max_queue_size:{MAX_QUEUE_SIZE}," \ f"enable_context_fmha_fp32_acc:{ENABLE_CONTEXT_FMHA_FP32_ACC}," \ f"encoder_input_features_data_type:{ENCODER_INPUT_FEATURES_DTYPE}," \ + f"prompt_embedding_table_data_type:{PROMPT_EMBEDDING_TABLE_DTYPE}," \ f"participant_ids:{PARTICIPANT_IDS}," \ f"logits_datatype:TYPE_FP32,guided_decoding_backend:{GUIDED_DECODING_BACKEND},tokenizer_dir:{TOKENIZER_PATH},xgrammar_tokenizer_info_path:{XGRAMMAR_TOKENIZER_INFO_PATH}'", shell=True) diff --git a/tests/integration/defs/triton_server/conftest.py 
b/tests/integration/defs/triton_server/conftest.py index 2afebbee14..d66bc0f09d 100644 --- a/tests/integration/defs/triton_server/conftest.py +++ b/tests/integration/defs/triton_server/conftest.py @@ -564,6 +564,19 @@ def tiny_llama_model_root(): return tiny_llama_model_root +@pytest.fixture(scope="session") +def mistral_small_3_1_24b_model_root(): + models_root = llm_models_root() + assert models_root, "Did you set LLM_MODELS_ROOT?" + model_root = os.path.join(models_root, + "Mistral-Small-3.1-24B-Instruct-2503") + + assert os.path.exists( + model_root + ), f"{model_root} does not exist under NFS LLM_MODELS_ROOT dir" + return model_root + + # Returns an array of total memory for each available device @pytest.fixture(scope="session") def total_gpu_memory_mib(): diff --git a/tests/integration/defs/triton_server/test.sh b/tests/integration/defs/triton_server/test.sh index e0819738ef..7782ecfe97 100755 --- a/tests/integration/defs/triton_server/test.sh +++ b/tests/integration/defs/triton_server/test.sh @@ -163,6 +163,7 @@ print_test_params () { echo "DECODING_MODE: ${DECODING_MODE}" echo "MAX_QUEUE_SIZE: ${MAX_QUEUE_SIZE}" echo "ENABLE_CONTEXT_FMHA_FP32_ACC: ${ENABLE_CONTEXT_FMHA_FP32_ACC}" + echo "PROMPT_EMBEDDING_TABLE_DTYPE: ${PROMPT_EMBEDDING_TABLE_DTYPE}" echo "run_all_tests: ${run_all_tests}" echo "----------------------------------" } @@ -180,26 +181,26 @@ fill_triton_repo () { fi echo "Filling triton repository at ${TRITON_REPO}/tensorrt_llm with engine ${DECODER_ENGINE_PATH}" - python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${DECODER_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},enable_context_fmha_fp32_acc:${ENABLE_CONTEXT_FMHA_FP32_ACC},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32,lookahead_window_size:${LOOKAHEAD_WINDOW_SIZE},lookahead_ngram_size:${LOOKAHEAD_NGRAM_SIZE},lookahead_verification_set_size:${LOOKAHEAD_VERIFICATION_SET_SIZE} + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt 
triton_backend:${BACKEND},engine_dir:${DECODER_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},enable_context_fmha_fp32_acc:${ENABLE_CONTEXT_FMHA_FP32_ACC},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},logits_datatype:TYPE_FP32,lookahead_window_size:${LOOKAHEAD_WINDOW_SIZE},lookahead_ngram_size:${LOOKAHEAD_NGRAM_SIZE},lookahead_verification_set_size:${LOOKAHEAD_VERIFICATION_SET_SIZE} python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${PREPROCESSING_INSTANCE_COUNT} python3 tools/fill_template.py -i ${TRITON_REPO}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${POSTPROCESSING_INSTANCE_COUNT} python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},logits_datatype:TYPE_FP32 if [ "${DRAFT_ENGINE_PATH}" != "" ] && [ "${DRAFT_ENGINE_PATH}" != "skip" ] && [ "${TARGET_ENGINE_PATH}" != "" ] && [ "${TARGET_ENGINE_PATH}" != "skip" ]; then - python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_TARGET_MODEL_NAME},logits_datatype:TYPE_FP32,tensorrt_llm_draft_model_name:${TENSORRT_LLM_DRAFT_MODEL_NAME} + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_TARGET_MODEL_NAME},logits_datatype:TYPE_FP32,tensorrt_llm_draft_model_name:${TENSORRT_LLM_DRAFT_MODEL_NAME},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE} else - python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_MODEL_NAME},logits_datatype:TYPE_FP32,tensorrt_llm_draft_model_name:"" + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_MODEL_NAME},logits_datatype:TYPE_FP32,tensorrt_llm_draft_model_name:"",prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE} fi if [ "${DRAFT_ENGINE_PATH}" != "" ] && [ 
"${DRAFT_ENGINE_PATH}" != "skip" ]; then echo "Filling triton repository at ${TRITON_REPO}/tensorrt_llm_draft with engine ${DRAFT_ENGINE_PATH}" - python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt triton_backend:${BACKEND},engine_dir:${DRAFT_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32 + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt triton_backend:${BACKEND},engine_dir:${DRAFT_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},logits_datatype:TYPE_FP32 fi if [ "${TARGET_ENGINE_PATH}" != "" ] && [ "${TARGET_ENGINE_PATH}" != "skip" ]; then echo "Filling triton repository at ${TRITON_REPO}/tensorrt_llm_target with engine ${TARGET_ENGINE_PATH}" - python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_target/config.pbtxt triton_backend:${BACKEND},engine_dir:${TARGET_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:true,normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32 + python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_target/config.pbtxt 
triton_backend:${BACKEND},engine_dir:${TARGET_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:true,normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},logits_datatype:TYPE_FP32 fi @@ -217,7 +218,7 @@ fill_triton_repo () { cp all_models/multimodal/multimodal_encoders ${TRITON_REPO} -r python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},logits_datatype:TYPE_FP32 python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${DECODER_ENGINE_PATH} - python3 tools/fill_template.py -i ${TRITON_REPO}/multimodal_encoders/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},hf_model_path:${TOKENIZER_PATH} + python3 tools/fill_template.py -i ${TRITON_REPO}/multimodal_encoders/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},prompt_embedding_table_data_type:${PROMPT_EMBEDDING_TABLE_DTYPE},hf_model_path:${TOKENIZER_PATH} python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt multimodal_encoders_name:multimodal_encoders fi @@ -649,6 +650,7 @@ TRITON_METRICS_PORT="8002" GPU_DEVICE_IDS="" DECODING_MODE="top_k_top_p" MAX_QUEUE_SIZE="0" +PROMPT_EMBEDDING_TABLE_DTYPE="TYPE_FP16" if [ "$MODEL" = "gpt-ib" ] || [ "$MODEL" = "mistral-ib" ] || [ "$MODEL" = "mistral-ib-mm" ]; then diff --git a/tests/integration/defs/triton_server/test_triton_llm.py b/tests/integration/defs/triton_server/test_triton_llm.py index d6f4be2b05..02755da458 100644 --- a/tests/integration/defs/triton_server/test_triton_llm.py +++ b/tests/integration/defs/triton_server/test_triton_llm.py @@ -1,4 +1,5 @@ import os +import re import sys import pytest @@ -3893,3 +3894,198 @@ def test_tiny_llama_ifb_token_counts( print_info( f"Successfully tested token count functionality for {TOKEN_COUNT_TEST} mode" ) + + +@pytest.mark.skip_less_device_memory(80000) +@pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"]) +@pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"]) +@pytest.mark.parametrize("BLS_INSTANCE_COUNT", ["1"]) +@pytest.mark.parametrize("PREPROCESSING_INSTANCE_COUNT", ["1"]) +@pytest.mark.parametrize("POSTPROCESSING_INSTANCE_COUNT", ["1"]) +@pytest.mark.parametrize("MAX_TOKENS_IN_KV_CACHE", [""]) +@pytest.mark.parametrize("MAX_ATTENTION_WINDOW_SIZE", [""]) +@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", + ["max_utilization", "guaranteed_no_evict"]) +@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", ["0.7"]) +@pytest.mark.parametrize("CROSS_KV_CACHE_FRACTION", [""]) 
+@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"], + ids=["disableTrtOverlap"]) +@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"]) +@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"], + ids=["enableDecoupleMode", "disableDecoupleMode"]) +@pytest.mark.parametrize("TRITON_MAX_BATCH_SIZE", ["1"]) +@pytest.mark.parametrize("MAX_QUEUE_DELAY_MICROSECONDS", ["0"]) +@pytest.mark.parametrize("ENABLE_KV_CACHE_REUSE", ["False"]) +@pytest.mark.parametrize("NORMALIZE_LOG_PROBS", ["True"]) +@pytest.mark.parametrize("ENABLE_CHUNKED_CONTEXT", ["False"]) +@pytest.mark.parametrize("GPU_DEVICE_IDS", [""]) +@pytest.mark.parametrize("DECODING_MODE", [""]) +@pytest.mark.parametrize("MAX_BEAM_WIDTH", ["1"]) +@pytest.mark.parametrize("EXCLUDE_INPUT_IN_OUTPUT", ["False"]) +@pytest.mark.parametrize("PROMPT_EMBEDDING_TABLE_DTYPE", + ["TYPE_BF16"]) # allow override later +@pytest.mark.parametrize("ENCODER_INPUT_FEATURES_DTYPE", + ["TYPE_FP16"]) # pixtral uses fp16 vision by default +def test_mistral_small_3_1_24b_pixtral( + E2E_MODEL_NAME, + MAX_TOKENS_IN_KV_CACHE, + MAX_ATTENTION_WINDOW_SIZE, + BATCH_SCHEDULER_POLICY, + KV_CACHE_FREE_GPU_MEM_FRACTION, + CROSS_KV_CACHE_FRACTION, + ENABLE_TRT_OVERLAP, + BATCHING_STRATEGY, + DECOUPLED_MODE, + TRITON_MAX_BATCH_SIZE, + MAX_QUEUE_DELAY_MICROSECONDS, + MAX_BEAM_WIDTH, + ENABLE_KV_CACHE_REUSE, + NORMALIZE_LOG_PROBS, + ENABLE_CHUNKED_CONTEXT, + GPU_DEVICE_IDS, + DECODING_MODE, + PREPROCESSING_INSTANCE_COUNT, + POSTPROCESSING_INSTANCE_COUNT, + ACCUMULATE_TOKEN, + BLS_INSTANCE_COUNT, + EXCLUDE_INPUT_IN_OUTPUT, + PROMPT_EMBEDDING_TABLE_DTYPE, + ENCODER_INPUT_FEATURES_DTYPE, + tensorrt_llm_multimodal_example_root, + tensorrt_llm_llama_example_root, + mistral_small_3_1_24b_model_root, + llm_backend_multimodal_example_root, + llm_backend_venv, + llm_root, +): + if BATCHING_STRATEGY == "V1" and BATCH_SCHEDULER_POLICY == "max_utilization": + pytest.skip("Skipping. 
V1 doesn't support max_utilization.") + + llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"] + + # Build Engines (LLM + vision) + ENGINE_PATH, MULTIMODAL_ENGINE_DIR = prepare_mistral3_pixtral_engine( + tensorrt_llm_multimodal_example_root, tensorrt_llm_llama_example_root, + mistral_small_3_1_24b_model_root) + + # Prepare model repo + new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo") + prepare_ib_model_repo(llm_backend_repo_root, new_model_repo) + + # Prepare multimodal specific repo + prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo, + "ensemble") + prepare_multimodal_model_repo(llm_backend_repo_root, new_model_repo, + "multimodal_encoders") + + # Modify config.pbtxt + TOKENIZER_PATH = mistral_small_3_1_24b_model_root + modify_ib_config_pbtxt( + new_model_repo, + ENGINE_PATH, + TOKENIZER_PATH, + llm_backend_repo_root, + DECOUPLED_MODE, + MAX_TOKENS_IN_KV_CACHE, + MAX_ATTENTION_WINDOW_SIZE, + BATCH_SCHEDULER_POLICY, + BATCHING_STRATEGY, + KV_CACHE_FREE_GPU_MEM_FRACTION, + EXCLUDE_INPUT_IN_OUTPUT, + ENABLE_TRT_OVERLAP, + TRITON_MAX_BATCH_SIZE, + MAX_QUEUE_DELAY_MICROSECONDS, + MAX_BEAM_WIDTH, + ENABLE_KV_CACHE_REUSE, + NORMALIZE_LOG_PROBS, + ENABLE_CHUNKED_CONTEXT, + GPU_DEVICE_IDS, + DECODING_MODE, + PREPROCESSING_INSTANCE_COUNT, + POSTPROCESSING_INSTANCE_COUNT, + ACCUMULATE_TOKEN, + BLS_INSTANCE_COUNT, + MULTIMODAL_ENGINE_PATH=MULTIMODAL_ENGINE_DIR, + ENCODER_INPUT_FEATURES_DTYPE=ENCODER_INPUT_FEATURES_DTYPE, + PROMPT_EMBEDDING_TABLE_DTYPE=PROMPT_EMBEDDING_TABLE_DTYPE, + ) + + # Launch Triton Server + launch_server_py = os.path.join(llm_backend_repo_root, "scripts", + "launch_triton_server.py") + check_call( + f"PMIX_MCA_gds=hash python3 {launch_server_py} --world_size=1 --model_repo={new_model_repo}", + shell=True) + check_server_ready() + + image_merlion = os.path.join( + llm_root, + "tests/integration/test_input_files/merlion.png", + ) + image_football = os.path.join( + llm_root, + "tests/integration/test_input_files/pexels-franco-monsalvo-252430633-32285228.jpg", + ) + image_hockey = os.path.join( + llm_root, + "tests/integration/test_input_files/pexels-ron-lach-8975010.jpg", + ) + image_basketball = os.path.join( + llm_root, + "tests/integration/test_input_files/pexels-maxim-shklyaev-1511525-2914194.jpg", + ) + + test_cases = [ + { + "text": "What is the capital of England?", + "image": "", + "match": re.compile("london", re.IGNORECASE) + }, + { + "text": "In as few words as possible, what city is this?", + "image": image_merlion, + "match": re.compile("singapore", re.IGNORECASE) + }, + { + "text": + "In as few words as possible, what sports are depicted in the images?", + "image": + ",".join([image_football, image_hockey]), + "match": + re.compile("(football|soccer).*hockey", re.IGNORECASE | re.DOTALL) + }, + { + "text": + "In as few words as possible, what sports are depicted in the images?", + "image": + ",".join([image_football, image_hockey, image_basketball]), + "match": + re.compile("(football|soccer).*hockey.*basket", + re.IGNORECASE | re.DOTALL) + }, + ] + + for test_case in test_cases: + TEXT = test_case["text"] + IMAGE = test_case["image"] + MATCH = test_case["match"] + + # Run Test: use multimodal client; set model_type to pixtral + run_cmd = [ + f"{llm_backend_multimodal_example_root}/client.py", + "--model_type=pixtral", + f"--text={TEXT}", + f"--image={IMAGE}", + "--request-output-len=128", + "--end-id=2", + ] + if DECOUPLED_MODE == "True": + run_cmd += ["--streaming"] + + if E2E_MODEL_NAME == "tensorrt_llm_bls": + run_cmd += 
["--use_bls"] + + output = venv_check_output(llm_backend_venv, run_cmd) + + assert MATCH.search( + output), f"Test failed for input: {TEXT=}, {IMAGE=}, {output=}" diff --git a/tests/integration/test_input_files/excel_table_test.jpg b/tests/integration/test_input_files/excel_table_test.jpg index 5c07c5e61e2cc98062eed109e376c57bddfa1626..f81e6b7bdc705e6518cd20a37daacda579881d68 100644 GIT binary patch literal 130 zcmWN{OA>=13;@u*r{DsWk0wEH1B4W2+Ts*+(bLz}+q^4(du<<8#yI+}^|Ox0OZt3U z&N!#`*8RvyUv_GhRWHE-?j9T&p;bKN7QL9{L3|Ftkx0bZ1S~!o0SL4$E^$EQJF+Ym O15UmEvU0}KI{g6y0VqZQ literal 20174 zcmV)BK*PUMNk&G-P5=N`MM6+kP&iDvP5=NeD#1k%*9R+-ZP&`IeI7aIU;qL5*Mou= z&)I8zVgZsQNsg4rs$I0!+&JrCU#A<2sn}mK_UQJ5@gs&=s*BVPykts zC}JeGX{c5}P!Rxwl>{N25)YdgdnpO5p&z0oZDLBh*zekYuGnT>qGoNl7A!s>l({ zmdzG5zmDLz{jsfWr|pkzw)MnXe|pR|yY2set!dluPir;1owg5ci(7~=#AeHgBr6zJ z)1i$0;-u)Y^Sx=e2-9eRK@uC9J>|8(?+csHQ^!8DKRS}1GyOMwKgaE(NyA3?e#P^R zU0+Lh_i%II@d7s|)79BDo z-;YcEf9UyQ-}}qG`;ptXP1m~(Ll^{IoRylG#^JYI-FUo zKm~_9PLDQy*I$z6)SSDkZ8^PZ+n%-2v~3wf_^`n)u)xMKjLz0SU|mdqU&T2}aM3C`-o5APLdI zyaReh&(3M)Y}4^*n%6O97_;%|#YDV=>2L;e7@lWUbxF8q&Q<^azv>^+e@KiZIZ~=` zo10Yq_B#KZKbF}4S;)3uBv+x%qUBT8T2x3_yGwTM zEwnfXdyzFM-2#v&03}tqwxi=*ZeAN8_z}gb)BJGQh4)Wl-}04e@>(fTL={=vN%BrW z65wrSW5~GqYyL(0mApR48w->S`YV{(sElOpX)L}kZ0F$q^LRkh}nb7QFAuyOqB+H@pdp>#9` z#u%u?sG~p?0Mr1+cOjx&$q@iRoj-j%tVgE40=ugwa6?#}suqQ0(I=|YsS@%7k9vJt zNv~>-bV+nvMXQC&QLtflM8AsEqB~-)h^YWzpD0kO1fX6qKKOaHgS?q??bD+;HDU*| zyHo+k@xGF(KG?)83sZRUH6mt~ANo@Y)XNpg^#`nYIMydJ=EwDag8w`;bZt@sy5hiF@?t|_yfeks_utE2g$ zPh~AmsaED@H7KH?tq{Nrv|>$fo)-cef*wGs4nAqeGmA4Pv?*R+-gC4k)E~~AZnJ*i z=y>1zS=m`>OY#JYKUG*C*rA+%Xbor&Clr>t&dme^qxnH}kg*sO^ z!`Ls)dfbH-!2#0dndP z!CF=8jnd&DW?tX>+sXVJFVAy*uBvyrVk;-86>WOPEO;ZylO3}4MJE#@0Hu!i?kHF0u+i*7xtd!HUZK%*$cUABYoaw6;~^=I^VI^_;Es=0E6=yar3P8F{ua#@_y+FFZkrIfYt za&uEktU^xR$or*;h)9*XE-K4FfujRcH)XcKRGXsZo$bDJZhX2{$yQNIZxLjk2v!Xk@ zbA1~E+-0_+(z-&UIP1q)iPP9Y#}2Ikpd!&Uf{0B$R&5bcC8kPSTGm=+Zz>5Cc4}w$?A_NtaoD;6_8Uy7W!EIPc0YjLk5|fb ze4v@hr^TNtM^WMCHk=cy>7rgKF11VEk9zN%?8L4ZTI$f3j6qdx>wEw!@72lpM^M*s6Awh3E0j4M{q%`P_B|Hr+0iL%R)tGt ziV)o2o~utD+FdCK%Z}@Fl~NXCPENo7>e8P8<{b>BzvqkJ@wXSR%J)CekCaX(CgM~r zJV3y=#21J?Q;t+AuAKdt&cHGN=uQHDE~K?yA%#aLdI0Sx>J`msfA!m+`5lyM5f8}r z0<-f#3p2A%s^TFe;?kQE@@3y_h88^^#Z<2d&R+W?F6)Y-Dr$mI0wP*5n9zZWP|A`B zBBEf9WOkChUG2U_-wGOZM*)S7kB{Vao7O~n_X6;m`_icH0%jz<0?uff3Q4Gd9tkGR zOX$@U1?SDlDQ?cIZ~G0~E0V~%)r`KjiFSU!f~y>9xav5C&ZQ_w^fQNPR37o3S99d= z4I&zPBZzB=-OT6{_K9i>@*Q}tfoN3Q+VW~kw$Ux?-&i-~cgjGslKX{`@@CcrEB3lg zm{wO3P7cGMR2e{RV$Pn}UV#!t%A?<}ECWB%e^XOsmzFj(DdD1Wt{+I!UJ9;a_wJ;j zn4?hWLbTXW?dwR=7gfpUwZ=NGo&R!xFPzli7!!_#lpJif5)!IXw5BRpoTEG=QGh*3 zfQA720`#g)G9eC_nZDWxd&r4X<7PA`>br{IrQ8I8_`g^9U#d z)PzFDO^k5dfw{8gj`qr`+BT5sfcP-PvIaC_<-biIGcUYzD-PU4A*dCT#iQqm71W*n z`5~&M0=Vx3*mz%h>-nDsF22=RUE_N{41mK`LCU`PnfIgMOTP*H=-wdS`P}F5*^7$F zZS0o8J;_SH*aRlo-SMp`#DeZtiXZOdcKLTqW{t*E-J__QqV8Meof{nW_fCqTd1&z{~@uvV_f7BKHMgTTEcvkXmSKr0_ zMuP#6VPP)58@#I&yohYmfi67^)`dsH>%Y*QnAd;#WF-88Vw+@SHT`VFh0*7Vi{CH; zCx;2sN}m=J^)YF%vK$BS6crHjo!~LOCuZ$2SxY{phfPkO*oq5zBtcd6mYg?F-YEhv z^U&RmZ-BQNbbg z_N#wde=7cJOUPC)e-sOkPw4EOd?r`*R{K)2Gfm3C;d-cV@!;Udb!Km@o;NqoEjo|RTB9h|#aF~7^w5~Kehr}{9F=~E@z?l)~E3h^ez#Pp|C zjA`rU)gG~!6I5dXpoV_c3z(om09imbk_LhoG(M|w zRhc8Z)iH++f6D|tlrx`q>eXuzeC51_dR2@Ob@SM2xib>(gUy=Xpplx{S zXQ3--bQBvGA}eY;g5kK58s8BY!s^H;5WZW9;O|#YfP1Nr>DK%AkdHQQ(VN4r zc78rSG4uy+bt@iblf51*Z$Z0bckS)apvbqquAf7+HT3+e=oArCx}u5EI1zyCgnen} ziFW*X@RvmVlqnf%L6?Jk2FwA4x-yUuplQfR;R2)w6HnLUJ!eG(Rz}jpupEVJn^(Da z9YG?>iKGz9uW?9yC!Sgu7<$UNJuM12Y_`&A${nep;Te#15%dlsCL8c+Ho0t9i3|-* 
z0xvqTV~y{n(y0V)Jvvw-&21{uCCF!3M46D_EaB*BUlr-OYo_lEh>F5=FNZzzml0&j zYE|o|#J;s8O@PX2b zHNz*-GCjp4Zf=(&oM43s>jPdZ{xHpby*oAh(ohG-H|HmNsb2$(t} zniQK=F+fTP-_u`z|FWX_R%jo7&l610rD^)*5m}0tUVh~@!xse+QILgM-nI%}&GWp- zwLW~lx+2Jn3MADlO09j(RTakhh0goMlZs2LRskL*;>@mfRVLC=<)?&4MUxeX0tzKy z&b6!Z?D?eY2*^iXPb47Q6IiUy6OXlhrZrHC>{hR9Un}e`z@+nwHT#pkuId%)@kD`t zv7)bew|QISKxAY@zR`x%N55N&!a?^(X{EKTu9gZx211yFx5GS(Lf6&NjZ#trj}kE3 z>h7@coGvsX3OmHtT^XDeG+^|`6=u{H9L@!2@6uO|Cgcoo8#(FjHD6vk;DoJo*gksM zz!A*dST?@yik;uCM4Ap&9cJZo2<;e&8&wW0Lx7VG0k(xF5jx1}ZoiN?S6MRz1M~^U zwX_sI>CCcTK&aEg4@MP&H*p^u=Wy83SI(Y_a+?|Qde?=h8GvT|!JXeNnQrGCxJ68G zkrQqjrS39D&qR++2%cRbLN%glQr%|s%uFMGDlI+}z)hciL?nRMQrwtwP&T9AA&M2C zh5o}iR_V4&P8ctIMPm#6bfDk^d5X;5WKv>_>LH)`lWF;%*-5)N!YJA#-5LKLzp z4FEtTmN1vzXb)Kdcq{`+wMNns~0Ip*rb^a#KHC*;KqKbk=E2^%50$A>usU6()%~BW*hBcOZOaTCA z8M!%BK7#;`Ecg{dO|VxvO3Ure0Z+B*i1F3iL5BX^IO~l{3ki$C?xx}*rMx5B{X8>s z*}13eGKsDyZE;8^B}Y@Rl3$C}Mc8f6a~!pcZ~`tLx>abIE4-r@R(it(4!~Owy4Ohu zphMScTWJE6ZNqJI&~;CMD!D*#17QkCj&-k8A4JQ}k#f*BaFje2XD!(4v+EPXiSweG z%!`gtdeyMh(+;r8G%J{@#|XPL*=194Pa)Na*diMlGqvi)+|KU@Hh>|#K#97Ax^7wo zdCL(D)5&#(A>|kvp8AX7_|b^xWlN1FChAa{+w+s`U@nWv1OCS zQe{!fw8J9n?eHgjPvbJsbuP{3R4cXWq=pGInn`0SP#3C;J3kJM_(f?5#85dk1c~qHP*VThzD0B6}X1>Jjyt8!BnP6 zjtD%1-(`(DaINuJd<(M#7VctsgnhR=mWl*koKJ(y^*U_zv!yU068M~K02KBwuw8(+ zU~>#{lnpfzs-8pe=E=@^w8mxgYm7hf(A3yQ0A zg-@#ehfqNpg}H~N?hYh{F_&x=mWK$yk%aA3LWjYM4aaqGb%z;+V6S5-IBrM-p0I-N zsBT`M?7^lTjx1t2=p{c10 z-r9;OE0Y>U_*H3zAP0-y{*Op|M8ri)C0^Nqq&}$T^dnQ={DA}G1w$nZV`rU6+CbjO zlnh-8x#>TRiXjDIJ;U81*|ZC>x(MPe{-g|AVDaKBfj-GFqN*emIuK7uirVd+v&p{R zZy0>p)#zwBH@ELX;){mKN~cKbTfV4~XG+vlkL~33(YUt%xjnyRJqIMJa zW$xIj*Rb?m)`r_vu>Ap^9}1d)jVsVnFS9-yo}WXb>|2SAo)(7dilLWZnKMywu*r#} zC)(4HgC+a=gNK0TaO#=}$WwmY+6;}rVU~=IwdflYiNNGfU+%exfVG?j07wfi%8A*y zfjk)-+|1Otc}_MII2uMk5N-MvBPh@Km}kPl!=eg$YVEq%Tg>%p5=fH4ieVx<(V|izjIat}et402G76?pK+lrsY*oG1x7Rt3o=`>K zO&JC|BXaBD*0l>_6fZ-F>PMR@5iUv-{s8S~2_9<7FihVRyBZ7UVq_=q$GNeR)gT)o z)P@_50}MA$csn!0oovwIW`tsjfi2w zX1W#0zS2h45^9KOsDTFoC=b z6vDq@V|-9*z;BW98O zZ~-GR%apW)C8c!n0UUK(aK4G&OZD9|N9yrqz%&SeTADXIi}1Vi@ZvRFRCXoSJ=qkRJ-xX?UaGq^3CqMDD6v+fMpB#oG+P$9sG z_5$UOf@@KM_$c*_T&>FAN_y4tUskA(wR5iU{cJ(I27Ap()JA)X90N(G@gGbS^%SlX zs_*`NqJWC2okg%eo@uMGby<;Mz(%-+%T)}0h-`@BPI*j6oU^N|4IDwu#EU85O~+)` z^zq10NAHoUIu=VSfkEw+>0>@ANiNY!6Hbv5 z4n$(vV(3b{?Ph?XG4kq5tVEj7KV{6h>n9}1Eb7-hy08rUfe3Nk-JJcmyItV~gRyet z23qyJrk)1XPk%U9Xe(l}EQhy9Ii11!K2U18ZC8rvAaj@HhOKeSw(mqJo+oa%w$T>R zzUuf1KRG=lam;GXTHzoK=6f3|8@y|Yar6!ov z)}ammEb0up8q3W3xkthTpqN}~Ob{N_n$y!Faz}Ymi4~rMS%Ls2=EDPD5|*MgmYzUY zUj94(nklGv$}~p&p_K}jGx$V9Y(zu{1mT4zR5xpZ_AHsmMU9?FL!b zWLnqf>GxGHPli_BjOhP{U-NIGX`Ut``US51<9EN#%;z&-(`>)t=RZFi=DsR3VcOek z_N-X4kuR^cLvDLtGy5t%={m2O=$Q(5R;2_Gwe|$0m@hx|ffTf6Kfl1teYOZ8ABN9w z;V2<9w0B@bb5I!(%SkOj*N>CND_8Qh3DmoPDm5#*Zh2W zpp+pjIsspn8EcaCk~gc*`Sjmxx6GV=20)SQ4?zo{!wBDCDu=o9GW18xIsX@0)E z32SH3B8Su&T7s!~(vg?9j2{$II|<%|K3Mn=5ucm1&aV%L^Pb?~si_vmcNWO~xb^a= zUxxta=!(>!T3yp*VpgWC|kK8rM08eV>Ttn7@p zB~uwE^^O5W2ZL`AdFplZs%zSxc#qaFP((CI zg7xX;OQx)-_bSkr#LSR6HZKqLp*l?lA1v6!ld3lr-omB{hy;sC4S9LNjPJqnNw=z&bmjZ__426^&8lmbN@7Ma8oN{uObNQ#prh?h zq#Sv9*Vn0o7-}e1kvoV~JBD}}$VwJ>8u1<1EQy%DPii_&|S@AP~n8k4%u`-@-UY@pZRntK#=*#CmxpE2=Goi7>@Eq-U)- ze#{u>qAMHoL1uk&rN|yi1>Zt$h^EW)95Sc zgyk4?=M*azRdx`aUBJ&ElcE_4T2T(Hgd|j}Y zPZdC(3M-yB?x{nHS{KSdgjl`r&54({a01b0Vss}Hi={t>D zFYjqh#*BP;9R%eT4b|4spUc~61oMf2q) zxq&?tFF(iSE`2`le3K1oNWeECJWMdal@m0(Bby${i|0H8s={8xc^`HCSM zScUs7lrJA3HaN&cATbCppUReiVQr3=hoLn6y7KaQ<>g7t80phd%u`c~6F3{%7BT z(;tC&`FY5(5jZR!iFFGvA8ES``n)qckQTOH{(Ikrb+UlGyd|IZ&gVE`-^1*+{`*krmi7qwzUR*j@aI#LFY>Lxl%Wx`{(dFOe@_GX!a1 
zt!+VBa}{>-&7YSifFF#adhkYcMZ63W#q;txAvxOy4_%AG%cBtic=EWxVv3`)f}8)H zX`D`^3Mhy4kduC>w8Z7d%Xe9(nCN;89w1^nddmq5ynde$Q|n6n0mAU|dzXZ(Lska^ z4(g@i-5$%cr2-i^Mal)^2Sz4RzPx<(-8d|E2vm8-)v}xsEeLe1mLwwM_f7`0V7&Ym zx}|kPa7~B`i{#6XFzIUTAyiZp6!sGq<;csAi8U1XRFh{aAraPV9 zN5+iUpM~P(EhkzIr@0B9v&h2gve6ZkV~JP)UyMt>ygW=)N6$8T4LHsj@;FgEDTwcN zP;~Pk;dpsaH5>&ksFwO}Poy9Tj{)ZwR61RG`Na61XnsNUL+=svl;Lg)!UPm*vrrBd zLhK)gHz#MW!M>X8scmRMAO=#p77mVc z#s~OVk^xI%AS0c304Aby_PFHcTWSS@$o6?}7KoR}ETe#$3ix`>HCiHHo@nYOvc*NF zVDwLCULHv&3fkqcoYQ&tR5>V4Y0|WI@CN~a|IY2kJXNEB>joIg;C3gk)C7|n_op0G zb#NNM9?HBU1rQVL+`;oqC{i1Ku^GBdsKRlyqezq;U8tw?-V^$HEb23qHMpLEfBf*L z_C_#X-pa6D(LE_IkgQ}d0oIu4Bs8(4Ni1E9tlP8YT`1KW=)89U{CS?hNHk6JWTZc= z&U^mux@Csp_tMkDokPuw$r!9I(Kp5Hw`UtTrxGPFh^+RnevdmwR0InXnaD6yLc(0P{( zy@VYQjB#c{=bd@mNt;)j9Lo)KD|7^4KYA)Xn?_x3wI7-R@1dL zctT@N(seRR5ZT_Ma^>ashIeW34~NSPP)KS<=UoP|e!9<$kiKAM9;05K=y@On+7Mx* z7K=>TIQTDs<_ENWv#RCE%SVtyWW)eNMGegEED}o{ubwHZdIB2Jc=_NbU3MHbCL8J) zZiTT(acsIN!`>JAzU7S0`@uU34%3Fx3})O)pz}Te$NC-du8@&UJYK#V*Ol2gfm5RJnR9-M02NVpViv;8LW&^FV84PQZjTb#kk_7 ziowB3z|VqO$9(j7d7p1VDY+4fn1bq8N9TQjb@04K*;6GVFP}<m`*zHlX=W0r2rytXky zEk+h}?^?>)EoLo|AycPl)WywAHNPYGgfy!>%_9O%6FP%|?aO?JmTYM=L$-WVX% z%`oy0g;l>l^J3O zwf0J^dS6~X8%#!2!E$o1^C{mrqw~(fCBZ_4-6|q8V^p1YzyqTn#Ck>@bhp=dvB)_G zS~xqGVC@H`eO;q-;N>Us2#9eQ!c6q5M`6I1L9*boRN?ByVw67b!H9zK=%|E5=Q~-s zOtv5pC9oE{0&6Q`?CGiVE?d#^zzc{m!3m{}IiYf!>32lLq+{6UJsdBeRyR&UDAC&E z{OSts)|tqA%t&YxWRqtnUfvF*5=}oEtltfw3j2`8#d#aVL|U2TWA=I9Rv4) z!vc$1KLDr`BMTlx*22?wNAvPCd9*O>Y72MWXg#|otSpV5TfI)$I0(^*ecs_%Wz_^^ z$9&Shz{*}(#r{^!=oV3u|(0svLXwUA!(L+8@bJa0J1stGCV1o?d&R8%B9Z11+3-Ne) z+2N@Sc$Ua)9_3(dVQrDc0a9sdJiVS~n=x7rpjU>TlsF}cFM|5LQLBrc_4~XBB?fqOk?$V7ajwz zV?7i0Se7Vzga76T*ILe6_=08ucYE>ju{{gZnH-FtQX2mme|2@uyzfzG?lb{|~ACUM=f z*eC|nc~6AWL({(v;1YCLXvNS!Y%zv@zA#QdUfxm{JMgDuTy`Y|9adePcRTyGtwfc) zSVNv1tIxYa1(U<6Gk9Juai8}+0_{qJYb(tO$A!+j@$x1k`ykgADB28~eVupX@xa{{ ztAo6pk`kSBf_aQS?}FG|e=>=P!Tse9Ga9q&Ya*q{uLYc3#k2sYgKk!o5jt<^Xm&Or zabiMMyXs;D`0cjmsD0i+kcM>)$QEfYEjOBu;00+zH-nu+(Fi?Jq~HfZ+)Va$-t7c4 z{GI{kX)?~`kDxm5fO751HT+>P(z>cboS!eBea!@XI`0==foDPj(wCQ)&RyXPWb%1M z?G^4P|9p}Iop<9374$xXO?LvBCnJqsFHa|)2XbnbxzD=^9FQI!&!XIUc`3k6C!w)J zO=I;DM>_9BL~sP(07GVV*fxQAc{FjOxt6U3@AH0u8X1oeC1+m#L@akyqobJMhx$72 zWJUE`4vCNpX)tcRydj}G5fK3FUgSRSZg^=%Z5?^}K_0=A0xP4_I1_d(j&$D5-VSaA zxlKbYZ9KCZwO-y%=%@YR5k&4B=nIO4=W{dsxW5*vcK;KWtq@0024yxUW_ za8Y_Nl+9z;%R36I(yP=pA#>6DybCawYGP^y;N|6pC~ye^cx2ghoCKhepLgSo5pMS9<%0kO`Ebs%^o`@}lEjhDJ9V@=Qdyv-_spL_ zFQ2%*7C2LS!TY>h3UW#rgO_&{2}B$dR1nk_f&VjPvyKC~L>|9h9%*ldH7!Bg`IA4E zTAr{^59Ej;q9X9}NC3>QtZoD_l6zsVn9!oBgZCKl>jdz5^zw;<1BG$^jD~SOpK|!z zf{2yQyu5LHA`_G&%(-tUOq0gHg~3tWZT?C{n|}hmd>&$C6r6aW`@Dy1jaw+unsoE zBC|5F4TJO&eDKUll0BVw8;Fg~GugXAWM1ALLoN0Qh$@u%`SKQ6d#{XRu-vO6@bW1+ z=m=gQqGkbyx;pQ6mKCJ{%(`=^jiy3$Uf!pHJ~s%IQ+`i`-vAVWm-l2CSoIp{yb~c2 z4~bCy$bH_YfNJE+FDPkfop+ytarwx&+KD`UCZBf%6z}uCG{0WwIF6UUp=-=i@22Z& zcO*!T-RHeI{}rH}FOV-EASvC%9qRgIGKzE(Q{H%8?bGxbeBQH-VWGcI2Ed=D814FG zPAQt`$K%#^IxP3pWO8Y{*YX5i1@^L`4&68Z9$r~SmbJ{h&vq)wdU>uN^?lt6XQ{lwRKFJ$OR z6gU-bfUJm?MK20mQ3kp`8B>WaW!{FD;W%0Bge=&%HnDZy$Kf!-6k}R<6;0~- zLic$$jB(KcvNe>1RiUK(czKTk3Ss05f}*O0hlM%TPUdJf+MX;8zK_`F9XD>l;|JUi zBE|v>5DJMatb0PCGxlT3*0%x`uv(LK|;6vDeI&tT5 zkrjoCJatn_JuYZ zF3dw%gz7k);VzkRO#!Y?#+g7)!)Vdqe>kkn6pv}7P|{6>XuNy@OG*rp8!Wfby=V*+ zGPpxNAb~G!k21*h$r$}i&Y(*hjAv}t|3k#kg8K=^%PZwt2v|z!oGr2d^%?rGTj3Fl6D`$;uy=C=w)A?(oMcesXa5{m@glvWV%|}<6^4dmH3-u<~{}rTQnRiY6C^!4)5}o#-15n zfl)N8n?iNY?QykRPu^nkk!l6n+~D&r+mYdgOjwyG^X0P|fx<%-)NAC++tJGtSHKrm z-F{Ln_-tV61h`;p?3tkP2zZ{~)=W=WcGM#ZHR6KK*cLinQ7G6Or`!XmyxfwqB=42K=diTk`u 
z;^G0DH9b?gF@K?uZqs>B6H*F-ou6w*JJmckwRgZT0vmhWfe-#d z$IjppMEmR6ZCw@u%z)p)tsGcB?e}?~W@{|%(@X#`|1nQArStxq0chU*d)2k(+JK?D zT!j!?`R@~;Kbu<}XI{ozX`ku32_Wv+)D$Pv6CW3ZD`380fE`64r85&i$f3Tq*(3 z%p+bt_LeE>72BBl2SXs=E?6MWTBSm`jNcH0vJD6bUPdXG_lc zGM&Yu1%Ucnb#ZIS`h^qtTK@sJ8P{NJMNUhu;Pn~jI>$P&U}e&DCg@BELq-x6l{DLoqAd3W!pU0zh_ z=d$E??*|h0R_arEmAM7dc?b36?)vT5dOUQ&<9zClQgwCnmJY=oWb(-v{v$t>-m|d0 z{QW#l?`;%MYx0{~&Bc-!=Z}6(22d}DpCsSgSzq{P=q_HB2OEX!<9p9LkhMwx35558 z4)N0d`1ion8&*!>-o}Y)W5{49L5Ei3`#Si7PbEdVI^}9#@`#s@zfJJ+GU)l;@PDJA z`FQ_uGZ5k2;}?t(Z^)bbJ4~&G!1?P2MhUZ?1~+*Bok~bpw66G(eDkoPcun{>9^L%? z;zN1+FzZ{&<6+t6%BgC$b>7BzUsg)PWgp$c0|0_}%`o7;{WOzDDgWW*@_(a`B z%~C(fY@y3j;x59Q)z|wq);RF(hjIy%9v##V-%|76P%j~Us&mEFAimOjKV4Y=gV5XKh;~%v}?laH{$0H z@6|eY?DN6Lqv%_qkb+CkX&>YK@1c{Q5;?+IzbU%MjWoO1qSR~frL(S}$t+U8@o^<0 zvD6Vq8EW`Fmf0*EfRC~Ca2Kdd->)tXvyE6Z(4&{X-@^O*H^+CS488dSY?l49!rAG~ zKle77EPxS0#7>@D5aL$=?|KkU2>T8H-?-tilrZKT-`;^o!2G59`J0iIAOWCB)O!}J z^?jN>>5xgSHb&u|cWaYB4(D(-D`Al=hDM}(hpm8jx-3_SX?!$%`J`|Dy+DDuH|j8O z4Atan_^+7gBMvKjxN~zc*8F$4!`qscIsU@ZH=2@)~r2eM^e4;J z0(1HYg{nar4dMrH$gBI{+BV^0+N*)@9&QpyJ)N7*=ly7C-dyBL%;sX=2E=9uNMd>U z*meLf?*-=um&%u4nuZa67p7JkUfwse?R@#q>j9&L zE~MCV*%%Qb^Ue_?GS&lL{tl*|mX{9@Ld=){%t{jpfDKq{xa(-~p$sQ+QT-_zf7zes)tATqbZG&rvI_*-^(d3XQgecnH-$J1FZ<%{X* zXSvmJmz59QcE^L`Y7f5kc=-^kla2ele@2hIGX8`QD#l^0?cO5`P;Xvm-{-Z4~ALD5k2>-KMSYoA$83f1%!iRZt zY$qxa@iw3L@H;FoA7Oj;*=f>BfjclUJ~rSUJp90j)lc^lA5e}^`?qfWE7>J{Mv{JZ z1=5he=m_EPlc9Yu>c$@G*?-vgw`>1nfs;OUXnf+(Kb>n>i(@gVLaBicdMtLL;twopcyk}vd253zrK&`Ru6;>fcr-SeLXNZ zAZmw0{B-!yz}}(Z2dv(@f8uHEeQ@~DAQFvzZP<4izwg?4M{3=zG~)s?0Go}rNLpB#U3@Q@y!=-q!{ zltS2hczDp!*uj1X(})(TYga%4k`8V0C11R#A^txqhOwuozeb1v5#)3tM3Ok9*Wm=s z^lLUkePQLFe!ynfJazvu(cuUCd(B}UHlc&44}=GGJN10`vQo)7%0o?n5>@T#XbqrGlg#z8~2+VSA4=SK1K6%q4c&! z#3T`QNs>U9$s|CRMIj+bi4XuvDJ%(XO1fbh;PR=nAD{k|^_dlB(5KI8{*~6Oe*;=u z0M@|FD*)eromoC#US4%K293jSpCmMnlu|bD%f2eLt{LvzuFN&pRqB~}-tn(~S$kgr z?2C0mB7Luzd!A?J#d&%~Y<+Dc*snd@=N&Kuzn-gBBM_bURjx>~ubC_GlKh}AeXK8i zR^S)vOMuIW_=!@&mx%D8QU!kbd3O2B&oZUxYeabUS(X)u0L4eTg79JTs{nFj;pNQB zR70Y9qF0EVa-Mrhv?F)J(KJ)^%z2`9!rQxj&oJ;kb&3@Rc{mm4QtEisICxkFS7Nvd z%Xn3Ja<-tA35Xq`22?~YDN=H_ynQQH90UlEtYe^dhUh8fsD!Q0ov?%!MKeQ@JF26K~aibG8ogIvceu#jOI)^txTiH zj4TZsM5clr!v{ngM?F$TNXilK*$pBBV0~W{3%MoM`sKj4z1(dz{hsR}x5zJCe3K&L zR=kHk1&sX5i|L(uyc*d~npXN+Et=4oYQ0QE#u^c;7Dnq-i=FA_26wHmbqCiKAG1s* zMKMU&)wlx|X8u4vezXH5?e~g>2os$i^*edt@R_U#tYK7(jNf*%GQ8EGr4AgsL}259 zFw`3bBpX61p=l#WLK~M#*_((>?(ABjvbzDI_AF+X3n$BTD=#cWSDB3V)UE)C*{Q1u zSIX^NFSb$u)}F{ZTXweK(3Q|#bE$OP?p*z(TlAIRd3f2bHtz7{s6STpi5d+5ns|P= z06(+Tax?kyyMb?OM%lv-0^1WdT{xUK6YMeYrW)%DAA#DBhxa>3%r5Z7PvYbKi=8py z>tpyD-@RY|9GI`Z9^3yeX@FD4A5fi+I$3sbZ=>Pp!RGQ&E*KE7T~{d5E=H=z?r#kG zWozs1sJ+^-oAE#KX>pFW5?ssbT>836Ws6Ws>V!sg*%Kv`Eu}D9$XC;?bY^AsmagC3 z@UZ#3FNc(bT4S}Ni>()q-%`RW`CD*Cv+kbydmvvQi%7`kpHlonBL$e!keEHxV|P!k zFOTHx{|rAWe18e>N0#tCf=?vB2RDC--@E0_{S7FU{@Upc_#-}V)~20Nc7TWTHSTOJ zj&jN$Zd&dxOJZD1s!+%9a!@+G$VY1StRKZsj6X!v0!z{I@l2(aC;(uJrmK{xQiM55 zF{C;`N{gTsmc?SH@loJi`0!4ptF8GG)IYmqOYY&iB6&IdVRvOKex|k}!#8wDxh6#9 zgr8@rRF7xQBiHNSDpnR?i0NHBq*@nH(O>Zo9T!*SlCP^KKaelJ-tQj$RX$W~&`O#A zZhY0W>kHiR{H6bn)mDFb{1W-A!#lp$P{Tn$qmG5mrS-*Rv$ir_RsCYN(TI&J59-{t zg3C+M_)Mix?T-GeLAeUFYVSYt>iK3)8?*htnJt>JI@dWv+#PdezXTyE;E zkV0aGSfFmXLXpbN<|C`OT=8efP5ti8zbZD8Kbe61g8CLp^*QVWzT00te$iMS576aD zxsB?v8U|jw!<(#BTc0i$kEV6IzcYKzW8>SsS7H`ctU$d2xg>yTdWg|T4W^xA05--{ zXX9CvV9~AKU4IX(@W@5!5V+7V&q`l@AjqQN+YpQN3;!4eTEE?V^H&jha}O=EFH9^L z?k5~p1~L?V8ZP&*_`^9DxbP?+S97o01xva+ga#!~qKNRkH*87m}Q2b!word~ak~P$qQp)i1f?|!{KR+(@4+bHY9-2>k{hLh33hN8#D zS1-S}H*fh^4~N%B)Yn6+FGDgjOiaBRzh}yD{h$hq8b?RBo{m>t?i7PUVY|a@uj^Y` 
zHhex?!LZ2Z^}T=JF|K&x#^J-o?Lu}gSG>;2YH3>prj_J+teoCyZpKrh z+q>}PI({y5^+oTa#z*QCrSER|UR}{Hb{MeQzWLr8vplFeeXPFq4gZozJmL^yn{vYH zvqncG<$L^Rr|pO9+9CRd|JKW)8<+{mZ3OA2x_SBaM&rHV<4*dM8jZ=Sp}CDAbtoZ8 zI9XZ)hnippAXuh||sq%Fk519o~gK0G?sGcog29Qb5ambN)g zRJfkcC9WkS%}tk%wyHD@+Q`+*wL;tx(I?%vy{cx;hFr#9u=fBXu6h`32n8`T6bB~> zoeTs724LxS%{|AEMrH*tqA5;qe!dU-lNrAZ#vhL;zaBpJFXi424KP~5jP0y(xbprf zc=?x&{@{JrXbsJ+iskUhC7M7YdDydCYI|{`3jLjH;}zu+v-_s&6(dC!IlO*wn?>UK z<^KAuo3+?*m5Rc^tk`X?h?Kp)9Z&4Ah|D#UF4v>5nG`<_ow06CJ6i6(^(Q{Z{kRe2 z_q_K5@2*O>-z~FQE&k>yw)BTNjP~*LCK*&;|LD zO1d zFSCP)Gpn0>e5A}byZK;ay4-SVMc{2`&sA(#dk+wk&q}uQGg=f1@l+{V+tg`FR;pSY z%KvkvWx){%2p6A|i{1-?d+^*dumPfHn-AeVIW(31xKZe)gR=@d8VEtJcvSG}?sC5? zVI@Nl>!*t0usBkz4*ZW5{)ZIg?gtir@`z0GKLMGsjR4eHAHm^?$_0yxme|LGh(Tt6E z_Wl;6BT=M0b9}y%E6mk1ksAST`jv$n9@>RzTccLB5=-XeB&svsJIWaUrt*5W&)_2B z9gSzf1O6V`hb?rt4%b6HPS(!oNB#akB+Jb==vbuhA-TWFYm&4^1z5 za!Q|W_@*M8u&!q?(~+{RcYFjNaGzg%BzbuxYq`~n!NsT}Fus0YzON492jv(h^smc5 z$uBRzk(%)disl#9w`2_`qqewBxWCb`%eM}TL=@#D*kN3n)V&RCp<2Ivcn7Lwu_3}T zh_qj^FV{+i#I+)xn9*?}v3Qdumq%;KCICtR=)~TWd0n{xq|S`UtXTgn`zQVx>Tlyr zxYmMVM>tE&>DLc(!^^$oWt28k6voOEsPcD{y9v#}b3==PiG+YaK4=60)NUX>z6Jt8 z3bZaEkQ=J_ih4I>W_Wq&)#`lq z^}Zfz@4af_yZYIxBf_ zCV%^Z9ur?}Y(G)FpdOz$_MY-i{plCxH39PEx+jb{a{f$8FhZYyVg9%kY80LI&Ikd^ zME9hVVu(!s=HqN~Mj!Vg?%6XZSfn^*<(!zeRhfn}*5l&TX>S{V45q@FXVIUricZd3 zMJQGZ4VTH^d{8p*2@&z90UUtZ0ZfDQmKJHjJE)CB_|yq23@CDbh)n)2Knl>^r9|yP z3>FpY>-926nUAMH1a+C4}9- z&L1BllfMUF)dkmciSuN7=z4y^c<&8_z+wIr41*7w_g#2cSM6AU!=4`^6Z!ka6Pn{X No16RpFyKG`DFAuCOTGXA diff --git a/tests/integration/test_input_files/merlion.png b/tests/integration/test_input_files/merlion.png new file mode 100644 index 0000000000..c8d299377a --- /dev/null +++ b/tests/integration/test_input_files/merlion.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f3b6a507ec92e8f47ac6d7c64e11b03fcba8c550bcb6851f80e261e8951431 +size 1604159 diff --git a/tests/integration/test_input_files/pexels-franco-monsalvo-252430633-32285228.jpg b/tests/integration/test_input_files/pexels-franco-monsalvo-252430633-32285228.jpg new file mode 100644 index 0000000000..ae27b79375 --- /dev/null +++ b/tests/integration/test_input_files/pexels-franco-monsalvo-252430633-32285228.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bd1efd0c8fe48b421210cd132dc3b3b2902ccf1523bb9bec3a3883bb5c7a650 +size 116299 diff --git a/tests/integration/test_input_files/pexels-maxim-shklyaev-1511525-2914194.jpg b/tests/integration/test_input_files/pexels-maxim-shklyaev-1511525-2914194.jpg new file mode 100644 index 0000000000..238fd51d10 --- /dev/null +++ b/tests/integration/test_input_files/pexels-maxim-shklyaev-1511525-2914194.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd922b837bc92353d49a60df1dd933eddfe7546e2b16b365acaadb9b2a0a683b +size 72231 diff --git a/tests/integration/test_input_files/pexels-ron-lach-8975010.jpg b/tests/integration/test_input_files/pexels-ron-lach-8975010.jpg new file mode 100644 index 0000000000..07ed42b2df --- /dev/null +++ b/tests/integration/test_input_files/pexels-ron-lach-8975010.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31c6fedadcb79990687d00d24350f774f4ad319439c89ed67d47c1df35a556fb +size 83652 diff --git a/tests/integration/test_lists/test-db/l0_a100.yml b/tests/integration/test_lists/test-db/l0_a100.yml index b8a846ccff..7a36bca755 100644 --- a/tests/integration/test_lists/test-db/l0_a100.yml +++ b/tests/integration/test_lists/test-db/l0_a100.yml @@ -99,3 +99,11 @@ l0_a100: - 
triton_server/test_triton.py::test_eagle[eagle] - triton_server/test_triton.py::test_llava_onevision[llava_onevision] - triton_server/test_triton.py::test_qwen2_vl[qwen2_vl] + - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-max_utilization---1-1-1-False-ensemble] + - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-max_utilization---1-1-1-False-tensorrt_llm_bls] + - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-ensemble] + - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls] + - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-max_utilization---1-1-1-False-ensemble] + - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-max_utilization---1-1-1-False-tensorrt_llm_bls] + - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-ensemble] + - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls] diff --git a/triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py b/triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py index 5a7d36ac1e..549f3f210d 100755 --- a/triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py +++ b/triton_backend/all_models/inflight_batcher_llm/preprocessing/1/model.py @@ -29,13 +29,13 @@ import io import json import os from collections import defaultdict -from typing import List +from typing import Dict, List, Tuple import numpy as np import requests import triton_python_backend_utils as pb_utils from PIL import Image -from transformers import AutoProcessor, AutoTokenizer, T5Tokenizer +from transformers import AutoConfig, AutoProcessor, AutoTokenizer, T5Tokenizer class TritonPythonModel: @@ -136,9 +136,9 @@ class TritonPythonModel: 'model_type'] assert self.model_type in [ - 'llava', 'blip2-opt', 'vila', 'mllama', 'llava_onevision', - 'qwen2_vl' - ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, vila, mllama, llava_onevision and qwen2_vl. Got {self.model_type}." + 'llava', 'blip2-opt', 'pixtral', 'vila', 'mllama', + 'llava_onevision', 'qwen2_vl' + ], f"[TensorRT-LLM][ERROR] Currently supported multi-modal models are llava, blip2-opt, pixtral, vila, mllama, llava_onevision and qwen2_vl. Got {self.model_type}." 
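        # Illustrative sketch only (not part of the patch): the Pixtral path set up
        # below builds its VisionPreProcessor from the HF processor plus three config
        # fields, using the same tokenizer_dir as the rest of initialize():
        #
        #     from transformers import AutoConfig, AutoProcessor
        #     processor = AutoProcessor.from_pretrained(tokenizer_dir, trust_remote_code=True)
        #     hf_config = AutoConfig.from_pretrained(tokenizer_dir)
        #     vocab_size = hf_config.text_config.vocab_size        # first "fake" prompt-table id
        #     image_size = hf_config.vision_config.image_size      # pad target for PIXEL_VALUES
        #     image_token_index = hf_config.image_token_index      # id of the [IMG] placeholder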
assert self.model_type != 'llava_onevison' or self.max_num_images is None or self.max_num_images <= 1, f"LLaVA-OneVsion is not support multi image inference currently." @@ -151,10 +151,18 @@ class TritonPythonModel: llm_model_config["pretrained_config"]["vocab_size"]) self._setup_ptable_shape(llm_model_config) - if self.model_type in ['mllama', 'llava_onevision', 'qwen2_vl']: + if self.model_type in [ + 'mllama', 'llava_onevision', 'qwen2_vl', 'pixtral' + ]: + full_processor = AutoProcessor.from_pretrained( + tokenizer_dir, trust_remote_code=True) + self.hf_config = AutoConfig.from_pretrained(tokenizer_dir) self.vision_preprocessor = VisionPreProcessor( self.model_type, - AutoProcessor.from_pretrained(tokenizer_dir), model_config) + full_processor, + model_config, + self.hf_config, + ) # Parse model output configs and convert Triton types to numpy types output_names = [ @@ -285,7 +293,9 @@ class TritonPythonModel: request, 'VIDEO_BYTES') vision_processed_tensors = [] visual_tokens = [] - if self.is_multimodal and (img_urls or image_bytes or video_bytes): + # Pixtral supports text-only input + if self.is_multimodal and (img_urls or image_bytes or video_bytes + or self.model_type == 'pixtral'): assert self.vision_preprocessor != None, "Vision preprocessor for preparing images before encoding is None" processed_tensors = {} if self.model_type == 'mllama': @@ -317,6 +327,19 @@ class TritonPythonModel: qwen2vl_input_length_tensor = processed_tensors.get( "REQUEST_INPUT_LEN") processed_tensors.pop("REQUEST_INPUT_LEN") + elif self.model_type == 'pixtral': + image_sizes = pb_utils.get_input_tensor_by_name( + request, 'IMAGE_SIZES') + processed_tensors, visual_tokens = self.vision_preprocessor.pixtral_process( + queries=query.astype(str).tolist(), + img_urls=img_urls, + image_bytes=image_bytes, + image_sizes=image_sizes, + ) + pixtral_input_id_tensor = processed_tensors.pop("INPUT_IDS") + request_input_len = np.array( + [[len(input_ids_for_batch)] + for input_ids_for_batch in pixtral_input_id_tensor]) else: raise ValueError( "Unsupported model type for IMAGE_BYTES or IMAGE_URL inputs" @@ -330,8 +353,9 @@ class TritonPythonModel: # Preprocessing input data. 
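        # For Pixtral, the INPUT_IDS returned by pixtral_process() already include the
        # expanded [IMG] placeholder tokens from the HF processor, so the tokenizer-based
        # _create_request() below is skipped for that model type and REQUEST_INPUT_LEN is
        # derived from the length of those ids instead.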
# For the LLaVA_OneVision model, num_multimodal_features is not a fixed value - input_id, request_input_len = self._create_request( - query, visual_tokens) + if self.model_type != 'pixtral': + input_id, request_input_len = self._create_request( + query, visual_tokens) if decoder_query is not None: decoder_input_id, request_decoder_input_len = self._create_request( decoder_query) @@ -362,6 +386,13 @@ class TritonPythonModel: 'INPUT_ID', qwen2vl_input_id_tensor) request_input_len_tensor = pb_utils.Tensor.from_dlpack( 'REQUEST_INPUT_LEN', qwen2vl_input_length_tensor) + elif self.model_type == 'pixtral': + input_id_tensor = pb_utils.Tensor( + 'INPUT_ID', + pixtral_input_id_tensor.numpy().astype(self.input_id_dtype)) + request_input_len_tensor = pb_utils.Tensor( + 'REQUEST_INPUT_LEN', + request_input_len.astype(self.request_input_len_dtype)) else: input_id_tensor = pb_utils.Tensor( 'INPUT_ID', input_id.astype(self.input_id_dtype)) @@ -719,7 +750,10 @@ class VisionPreProcessor: def __init__(self, vision_model_type, vision_model_processor, - preprocessor_model_config={}): + preprocessor_model_config=None, + hf_config=None): + preprocessor_model_config = preprocessor_model_config or {} + # import libraries that are only relevant for multimodal models import torch from torch.utils.dlpack import from_dlpack @@ -767,6 +801,12 @@ class VisionPreProcessor: self.vision_model_processor = vision_model_processor self.vision_model_type = vision_model_type + if vision_model_type == 'pixtral': + assert hf_config is not None, "Pixtral model requires hf_config to be set" + self.vocab_size = hf_config.text_config.vocab_size + self.image_size = hf_config.vision_config.image_size + self.image_token_index = hf_config.image_token_index + def load_images_from_urls(self, img_urls): images = [] for img_url in img_urls: @@ -777,10 +817,11 @@ class VisionPreProcessor: image_data = base64.b64decode(image_base64) # Create a BytesIO object from the decoded data image_buffer = io.BytesIO(image_data) - images.append(Image.open(image_buffer)) + images.append(Image.open(image_buffer).convert("RGB")) else: - images.append(Image.open( - requests.get(img_url, stream=True).raw)) + images.append( + Image.open(requests.get(img_url, + stream=True).raw).convert("RGB")) return images def mllama_process(self, queries, img_urls=None, image_bytes=None): @@ -879,6 +920,9 @@ class VisionPreProcessor: mode='constant') for image in preprocessor_outputs['PIXEL_VALUES'] ] + # Add a dimension image_sizes to match the dimensions defined in config.pbtxt + for elem in preprocessor_outputs['IMAGE_SIZES']: + elem.unsqueeze_(1) for key, tensor_list in preprocessor_outputs.items(): val = self.convert_tensor_list_to_tensor(tensor_list) if key in self.output_str_dtypes: @@ -1001,3 +1045,130 @@ class VisionPreProcessor: val, self.output_str_dtypes[key]) vision_processed_tensors[key] = val return vision_processed_tensors + + def pixtral_process(self, + queries, + img_urls=None, + image_bytes=None, + image_sizes=None + ) -> Tuple[Dict[str, "torch.Tensor"], List[int]]: + import torch + vision_processed_tensors = {} + if img_urls is not None: + assert image_sizes is None, "IMAGE_SIZES should not be supplied together with IMAGE_URL" + # download and read images + images = [ + self.load_images_from_urls(urls) + for urls in img_urls.as_numpy() + ] + images = [[np.array(img) for img in batch] for batch in images] + + # pad to the max_h, max_w dimensions to create one tensor for all images + shapes = [img.shape for batch in images for img in batch] + assert all( + 
len(s) == 3 + for s in shapes), "All input images must have three dimensions" + assert all( + s[-1] == shapes[0][-1] for s in shapes + ), "All input images must have the same number of channels" + max_h, max_w = max(s[0] for s in shapes), max(s[1] for s in shapes) + for batch_idx in range(len(images)): + for image_idx in range(len(images[batch_idx])): + images[batch_idx][image_idx] = np.pad( + images[batch_idx][image_idx], + ((0, max_h - images[batch_idx][image_idx].shape[0]), + (0, max_w - images[batch_idx][image_idx].shape[1]), + (0, 0)), + mode='constant', + ) + images = np.array(images) + elif image_bytes is not None: + images = self.load_images_tensor(image_bytes) + else: + images = np.empty((len(queries), 0, 0, 0, 0), dtype=np.uint8) + + batch_size = len(images) + assert len( + queries + ) == batch_size, f"Image must have the same batch size as Query." + + if image_sizes is not None: + image_sizes = self.load_images_tensor(image_sizes) + else: + s = images.shape + image_sizes = np.array([[[s[2], s[3]]] * s[1]] * s[0]) + + preprocessor_outputs = {} + possible_output_names = ['PIXEL_VALUES', 'IMAGE_SIZES', 'INPUT_IDS'] + visual_tokens = [] + for batch_id in range(batch_size): + # Preprocess images and query + query = queries[batch_id] + if not isinstance(query, (str, bytes)): + query = query[0] + if isinstance(query, bytes): + query = query.decode("utf-8") + if "[IMG]" not in query: + query = "[IMG]" * len(images[batch_id]) + query + assert query.count("[IMG]") == len( + images[batch_id] + ), "Number of [IMG] tags must match number of images" + + if not query.startswith("[INST]"): + query = "[INST]" + query + if not query.endswith("[/INST]"): + query = query + "[/INST]" + + sizes = image_sizes[batch_id] + curr_images = [ + img[:sizes[idx][0], :sizes[idx][1], :] + for idx, img in enumerate(images[batch_id]) + ] + if not curr_images: + curr_images = None + + processed_vision_data = self.vision_model_processor( + images=curr_images, text=query, return_tensors="pt") + visual_tokens.append(processed_vision_data['input_ids'].shape[1]) + if "pixel_values" in processed_vision_data: + # Pad to self.image_size x self.image_size + processed_vision_data['pixel_values'] = torch.nn.functional.pad( + processed_vision_data['pixel_values'], ( + 0, + self.image_size - + processed_vision_data['pixel_values'].shape[-1], + 0, + self.image_size - + processed_vision_data['pixel_values'].shape[-2], + ), + mode='constant') + # Create vision output tensors + for key in possible_output_names: + val = processed_vision_data.get(key.lower()) + if val is not None: + if key not in preprocessor_outputs: + preprocessor_outputs[key] = [] + if key != 'INPUT_IDS': + val.unsqueeze_(0) # unsqueeze to add batch dimension + preprocessor_outputs[key].append(val) + + for key, tensor_list in preprocessor_outputs.items(): + val = self.convert_tensor_list_to_tensor(tensor_list) + if key in self.output_str_dtypes: + val = self.convert_tensor_to_str_dtype( + val, self.output_str_dtypes[key]) + vision_processed_tensors[key] = val + + # Replace all image tokens with a unique token_id > vocab_size. + # This shall be used to lookup the prompt table. + for batch_id in range(batch_size): + # Note: We reset replacer to vocab_size for each sample. This is as opposed to doing `replacer = vocab_size + img_idx * tokens_per_task`. + # That part of the look-up manipulation is done by the `task_ids` input to PromptEmbedding forward. 
+ replacer = self.vocab_size + input_ids = vision_processed_tensors['INPUT_IDS'][batch_id] + for token_idx in range(len(input_ids)): + if input_ids[token_idx] == self.image_token_index: + input_ids[token_idx] = replacer + replacer += 1 + + return vision_processed_tensors, visual_tokens diff --git a/triton_backend/all_models/inflight_batcher_llm/preprocessing/config.pbtxt b/triton_backend/all_models/inflight_batcher_llm/preprocessing/config.pbtxt index b21585e4bd..ed819b7b60 100755 --- a/triton_backend/all_models/inflight_batcher_llm/preprocessing/config.pbtxt +++ b/triton_backend/all_models/inflight_batcher_llm/preprocessing/config.pbtxt @@ -55,7 +55,14 @@ input [ { name: "IMAGE_URL" data_type: TYPE_STRING - dims: [ 1 ] + dims: [ -1 ] + optional: true + }, + # Required for pixtral + { + name: "IMAGE_SIZES" + data_type: TYPE_INT64 + dims: [ -1, 2 ] optional: true }, { @@ -188,11 +195,11 @@ output [ data_type: TYPE_INT64 dims: [ -1, -1, -1 ] }, - # Required for image postprocessing in the llava_onevision model + # Required for image postprocessing in the llava_onevision and pixtral models { name: "IMAGE_SIZES" data_type: TYPE_INT64 - dims: [ 2 ] + dims: [ -1, 2 ] }, # Indicates if the input is video in the llava_onevision model { diff --git a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/1/model.py b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/1/model.py index 4f8863465b..ab165323c1 100755 --- a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/1/model.py +++ b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/1/model.py @@ -280,7 +280,9 @@ def get_prompt_tuning_config_from_request(request, kwargs = {} prompt_embedding_table = get_input_tensor_by_name(request, 'prompt_embedding_table', - batch_size, batch_index) + batch_size, + batch_index, + force_on_torch=True) prompt_table_extra_ids = get_input_tensor_by_name(request, 'prompt_table_extra_ids', batch_size, batch_index) diff --git a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt index 4f06581c04..f5f6cb41a4 100644 --- a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt +++ b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt @@ -319,7 +319,7 @@ input [ }, { name: "prompt_embedding_table" - data_type: TYPE_FP16 + data_type: ${prompt_embedding_table_data_type} dims: [ -1, -1 ] optional: true allow_ragged_batch: true diff --git a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py index 566e62cb7c..e6ed45d185 100644 --- a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py +++ b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py @@ -103,6 +103,7 @@ class Request: request_id: Optional[str] = None mrope_rotary_cos_sin: Optional[np.ndarray] = None mrope_position_deltas: Optional[np.ndarray] = None + image_sizes_input: Optional[np.ndarray] = None def validate(self): _validate_non_empty(self.text_input, "text_input is required") diff --git a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/triton_decoder.py b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/triton_decoder.py index 00bd315b13..2c9c5b8055 100644 --- a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/triton_decoder.py +++ 
b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/triton_decoder.py @@ -165,7 +165,12 @@ class TritonDecoder(Decoder): continue triton_name = tensor.name() if tensor.is_cpu(): - value = tensor.as_numpy() + try: + value = tensor.as_numpy() + except pb_utils.TritonModelException as e: + # Use to_dlpack()/from_dlpack() if as_numpy() fails, + # e.g. in case of BF16 tensors + value = from_dlpack(tensor.to_dlpack()) else: # If the tensor is in GPU memory make it torch.Tensor type value = from_dlpack(tensor.to_dlpack()) @@ -247,6 +252,7 @@ class TritonDecoder(Decoder): "text_input": "QUERY", "image_bytes_input": "IMAGE_BYTES", "image_url_input": "IMAGE_URL", + "image_sizes_input": "IMAGE_SIZES", "video_bytes_input": "VIDEO_BYTES", "decoder_text_input": "DECODER_QUERY", "max_tokens": "REQUEST_OUTPUT_LEN", diff --git a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/config.pbtxt b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/config.pbtxt index 7f38bf903a..7ad5ccf9f1 100644 --- a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/config.pbtxt +++ b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm_bls/config.pbtxt @@ -62,6 +62,13 @@ input [ dims: [ 1 ] optional: true }, + # An arbitrary number of images for pixtral + { + name: "image_sizes_input" + data_type: TYPE_INT64 + dims: [ -1, 2 ] + optional: true + }, { name: "video_bytes_input" data_type: TYPE_UINT8 @@ -199,7 +206,7 @@ input [ }, { name: "prompt_embedding_table" - data_type: TYPE_FP16 + data_type: ${prompt_embedding_table_data_type} dims: [ -1, -1 ] optional: true }, diff --git a/triton_backend/all_models/multimodal/ensemble/config.pbtxt b/triton_backend/all_models/multimodal/ensemble/config.pbtxt index d3affefabf..777118eefe 100755 --- a/triton_backend/all_models/multimodal/ensemble/config.pbtxt +++ b/triton_backend/all_models/multimodal/ensemble/config.pbtxt @@ -54,9 +54,16 @@ input [ { name: "image_url_input" data_type: TYPE_STRING - dims: [ 1 ] + dims: [ -1 ] optional: true }, + # An arbitrary number of images for pixtral + { + name: "image_sizes_input" + data_type: TYPE_INT64 + dims: [ -1, 2 ] + optional: true + }, { name: "video_bytes_input" data_type: TYPE_UINT8 @@ -253,6 +260,10 @@ ensemble_scheduling { key: "IMAGE_URL" value: "image_url_input" } + input_map { + key: "IMAGE_SIZES" + value: "image_sizes_input" + } input_map { key: "VIDEO_BYTES" value: "video_bytes_input" diff --git a/triton_backend/all_models/multimodal/multimodal_encoders/1/model.py b/triton_backend/all_models/multimodal/multimodal_encoders/1/model.py index acf601ac80..adb968d461 100755 --- a/triton_backend/all_models/multimodal/multimodal_encoders/1/model.py +++ b/triton_backend/all_models/multimodal/multimodal_encoders/1/model.py @@ -112,6 +112,8 @@ class TritonPythonModel: self.image_session = Session.from_serialized_engine(engine_buffer) self.vision_dtype_str = visual_config['builder_config']['precision'] + self.vision_max_batch_size = visual_config['builder_config'][ + 'max_batch_size'] features_output_name = "OUT_PROMPT_EMBEDDING_TABLE" if self.model_type == "mllama": features_output_name = "ENCODER_INPUT_FEATURES" @@ -162,7 +164,21 @@ class TritonPythonModel: self.vocab_size = hf_config.vocab_size self.qwen2vl_utils = Qwen2VLUtils(hf_config) - def get_requests(self, request: List) -> Dict[str, torch.Tensor]: + if self.model_type == 'pixtral': + from transformers import AutoConfig + hf_model_path = model_config['parameters'].get( + 'hf_model_path', None) + assert hf_model_path is 
not None and hf_model_path[ + 'string_value'] != "${hf_model_path}", "Need to provide hf_model_path for the Pixtral model" + hf_config = AutoConfig.from_pretrained( + hf_model_path['string_value']) + self.image_size = hf_config.vision_config.image_size + self.patch_size = hf_config.vision_config.patch_size + self.vocab_size = hf_config.text_config.vocab_size + self.spatial_merge_size = hf_config.spatial_merge_size + self.relevant_patch_size = self.patch_size * self.spatial_merge_size + + def get_requests(self, request) -> Dict[str, torch.Tensor]: """ Processes the incoming request to extract and organize input tensors for different model types. @@ -193,8 +209,10 @@ class TritonPythonModel: img_tensor = (pb_utils.get_input_tensor_by_name(request, 'pixel_values') or pb_utils.get_input_tensor_by_name(request, 'IMAGE')) - # mllama supports img_tensor is None case - assert img_tensor != None or self.model_type == 'mllama', "There is no preprocessed image tensor to encode" + # mllama and pixtral support img_tensor is None case + assert img_tensor != None or self.model_type in [ + 'mllama', 'pixtral' + ], "There is no preprocessed image tensor to encode" if img_tensor is not None: img_tensor = from_dlpack(img_tensor.to_dlpack()) @@ -242,6 +260,9 @@ class TritonPythonModel: image_sizes = from_dlpack( pb_utils.get_input_tensor_by_name( request, 'image_sizes').to_dlpack()) + # Remove dimension 1, which was added to match the dimensions defined in config.pbtxt + assert image_sizes.shape[1] == 1 + image_sizes.squeeze_(1) from transformers.models.llava_onevision.modeling_llava_onevision import \ image_size_to_num_patches image_num_patches = [ @@ -276,6 +297,33 @@ class TritonPythonModel: input_tensors['attention_mask_llm'].append(attention_mask) input_tensors['image_grid_thw'].append(image_grid_thw) + elif self.model_type == 'pixtral': + if img_tensor is None: + input_tensors['pixel_values'].append(None) + else: + assert batch_size == 1, "Only support batch size 1 for Pixtral, because each batch can contain a different number of images" + d_min = torch.finfo(self.vision_output_dtype).min + total_images = img_tensor.shape[0] * img_tensor.shape[1] + num_patches = self.image_size // self.patch_size + input_tensors['input'].append( + img_tensor.view(-1, img_tensor.shape[2], + img_tensor.shape[3], img_tensor.shape[4])) + attention_mask_shape = (total_images, num_patches, num_patches) + attention_mask = torch.full(attention_mask_shape, + fill_value=d_min, + dtype=self.vision_output_dtype, + device="cuda") + image_sizes = from_dlpack( + pb_utils.get_input_tensor_by_name( + request, + 'image_sizes').to_dlpack()).reshape(total_images, 2) + for image_idx in range(total_images): + image_h, image_w = image_sizes[image_idx][0], image_sizes[ + image_idx][1] + attention_mask[image_idx, :image_h // + self.patch_size, :image_w // + self.patch_size] = 0 + input_tensors['attention_mask'].append(attention_mask) else: input_tensors['input'].append( img_tensor.view(-1, img_tensor.shape[2], img_tensor.shape[3], @@ -408,7 +456,7 @@ class TritonPythonModel: f"encoder_output_lengths: {encoder_output_lengths}") # True when the request does not have image input - output_tensors = [ + response_tensors = [ pb_utils.Tensor.from_dlpack( 'ENCODER_INPUT_FEATURES', to_dlpack(encoder_input_features)), @@ -417,16 +465,16 @@ class TritonPythonModel: to_dlpack(encoder_output_lengths)) ] if cross_attention_mask is not None: - output_tensors.append( + response_tensors.append( pb_utils.Tensor.from_dlpack( 'CROSS_ATTENTION_MASK', 
to_dlpack(cross_attention_mask))) - output_tensors.append( + response_tensors.append( pb_utils.Tensor.from_dlpack( 'SKIP_CROSS_ATTN_BLOCKS', to_dlpack(skip_cross_attn_blocks))) inference_response = pb_utils.InferenceResponse( - output_tensors=output_tensors) + output_tensors=response_tensors) responses.append(inference_response) elif self.model_type == 'llava_onevision': for req_idx, embeddings in enumerate( @@ -443,6 +491,9 @@ class TritonPythonModel: image_sizes = from_dlpack( pb_utils.get_input_tensor_by_name( request, 'image_sizes').to_dlpack()) + # Remove dimension 1, which was added to match the dimensions defined in config.pbtxt + assert image_sizes.shape[1] == 1 + image_sizes.squeeze_(1) from transformers.models.llava_onevision.modeling_llava_onevision import \ image_size_to_num_patches image_num_patches = [ @@ -458,10 +509,10 @@ class TritonPythonModel: embeddings, image_sizes, image_num_patches) prompt_embedding_table_tensor = pb_utils.Tensor.from_dlpack( 'OUT_PROMPT_EMBEDDING_TABLE', to_dlpack(prompt_table)) - output_tensors = [prompt_embedding_table_tensor] + response_tensors = [prompt_embedding_table_tensor] inference_response = pb_utils.InferenceResponse( - output_tensors=output_tensors) + output_tensors=response_tensors) responses.append(inference_response) elif self.model_type == 'qwen2_vl': image_grid_thw = other_vision_input_tensors.get('image_grid_thw') @@ -493,12 +544,92 @@ class TritonPythonModel: 'MROPE_ROTARY_COS_SIN', to_dlpack(mrope_rotary_cos_sin)) mrope_position_deltas_tensor = pb_utils.Tensor.from_dlpack( 'MROPE_POSITION_DELTAS', to_dlpack(mrope_position_deltas)) - output_tensors = [ + response_tensors = [ prompt_embedding_table_tensor, mrope_rotary_cos_sin_tensor, mrope_position_deltas_tensor ] inference_response = pb_utils.InferenceResponse( - output_tensors=output_tensors) + output_tensors=response_tensors) + responses.append(inference_response) + elif self.model_type == 'pixtral': + assert len(num_images) == len(batch_sizes) == len( + is_skip_encoders) == len(requests) + images_per_batch = [i * b for i, b in zip(num_images, batch_sizes)] + split_along = np.cumsum(images_per_batch).tolist() + if output_tensor is not None: + splitted_output_tensor = torch.tensor_split(output_tensor, + split_along, + dim=0) + visual_embed_dim = output_tensor.shape[-1] + output_img_size = self.image_size // self.relevant_patch_size + + for req_idx, request in enumerate(requests): + if is_skip_encoders[req_idx]: + responses.append( + pb_utils.InferenceResponse(output_tensors=[])) + continue + + response_tensors = [] + assert splitted_output_tensor[req_idx].ndim == 3 + current_output_tensor = splitted_output_tensor[req_idx].reshape( + batch_sizes[req_idx], num_images[req_idx], + splitted_output_tensor[req_idx].shape[-2], + splitted_output_tensor[req_idx].shape[-1]) + image_sizes = from_dlpack( + pb_utils.get_input_tensor_by_name( + request, 'image_sizes').to_dlpack()) + complete_visual_features = [] + vocab_size = [] + for batch_idx in range(batch_sizes[req_idx]): + batch_visual_features = [] + for image_idx in range(num_images[req_idx]): + image_h = image_sizes[batch_idx][image_idx][0] + image_w = image_sizes[batch_idx][image_idx][1] + h_patches = image_h // self.relevant_patch_size + w_patches = image_w // self.relevant_patch_size + relevant_visual_features = torch.zeros( + 1, h_patches * w_patches, visual_embed_dim) + visual_features = current_output_tensor[batch_idx][ + image_idx].reshape(output_img_size, output_img_size, + visual_embed_dim) + flattened_features = 
visual_features[:h_patches, : + w_patches, :].flatten( + 0, 1) + relevant_visual_features[ + 0, :h_patches * w_patches, :] = flattened_features + batch_visual_features.append(relevant_visual_features) + batch_visual_features = torch.cat(batch_visual_features, + dim=1) + vocab_size.append(batch_visual_features.shape[1]) + complete_visual_features.append(batch_visual_features) + + # Pad elements of complete_visual_features to have the same shape[1], + # to allow concatenation over batch dimension + max_vocab_size = max(vocab_size) + for batch_idx in range(batch_sizes[req_idx]): + complete_visual_features[ + batch_idx] = torch.nn.functional.pad( + complete_visual_features[batch_idx], + (0, 0, 0, max_vocab_size - + complete_visual_features[batch_idx].shape[1]), + mode='constant') + complete_visual_features = torch.cat(complete_visual_features, + dim=0) + + prompt_embedding_table_tensor = pb_utils.Tensor.from_dlpack( + 'OUT_PROMPT_EMBEDDING_TABLE', + to_dlpack( + complete_visual_features.type( + self.vision_output_dtype))) + prompt_vocab_size_tensor = pb_utils.Tensor( + 'OUT_PROMPT_VOCAB_SIZE', + np.array(vocab_size, + dtype=np.int32).reshape(batch_sizes[req_idx], 1)) + + response_tensors.extend( + [prompt_embedding_table_tensor, prompt_vocab_size_tensor]) + inference_response = pb_utils.InferenceResponse( + output_tensors=response_tensors) responses.append(inference_response) else: for req_idx, embeddings in enumerate( @@ -530,17 +661,67 @@ class TritonPythonModel: prompt_vocab_size_tensor = pb_utils.Tensor( 'OUT_PROMPT_VOCAB_SIZE', prompt_vocab_size.astype(np.int32)) - output_tensors = [ + response_tensors = [ prompt_embedding_table_tensor, prompt_vocab_size_tensor ] inference_response = pb_utils.InferenceResponse( - output_tensors=output_tensors) + output_tensors=response_tensors) responses.append(inference_response) # You should return a list of pb_utils.InferenceResponse. Length # of this list must match the length of `requests` list. 
return responses + def run_vision_encoder(self, vit_input: Dict[str, + torch.Tensor]) -> torch.Tensor: + batch_size = [v.shape[0] for v in vit_input.values()] + assert all( + b == batch_size[0] + for b in batch_size), "Batch sizes of encoder inputs must match" + batch_size = batch_size[0] + + embeddings = [] + for start_idx in range(0, batch_size, self.vision_max_batch_size): + end_idx = min(start_idx + self.vision_max_batch_size, batch_size) + logger.debug( + f"Running encoder (max_batch_size={self.vision_max_batch_size}) " + + f"with batch indices {start_idx}:{end_idx} of {batch_size}.") + + # Slice the input tensors along the batch dimension + vit_input_batch = { + k: v[start_idx:end_idx] + for k, v in vit_input.items() + } + + # Set up output tensors + vit_input_info = [ + TensorInfo(key, torch_dtype_to_trt(val.dtype), val.shape) + for key, val in vit_input_batch.items() + ] + vit_output_info = self.image_session.infer_shapes(vit_input_info) + + vit_output_batch = { + t.name: + torch.empty(tuple(t.shape), + dtype=trt_dtype_to_torch(t.dtype), + device='cuda') + for t in vit_output_info + } + + # Run the vision encoder + with torch.cuda.stream(self.vision_stream): + ok = self.image_session.run(vit_input_batch, vit_output_batch, + self.vision_stream.cuda_stream) + assert ok, "Runtime execution failed for vision encoder session" + embeddings.append(vit_output_batch['encoder_output'].to( + self.vision_output_dtype)) + + with torch.cuda.stream(self.vision_stream): + embeddings = torch.cat(embeddings, dim=0) + + self.vision_stream.synchronize() + return embeddings + def execute(self, requests: List): """`execute` must be implemented in every Python model. `execute` function receives a list of pb_utils.InferenceRequest as the only @@ -664,28 +845,8 @@ class TritonPythonModel: vit_input['attention_mask'] = attention_mask_vit.to( str_dtype_to_torch(self.vision_dtype_str)).to('cuda') - # Set up output tensors - vit_input_info = [ - TensorInfo(key, torch_dtype_to_trt(val.dtype), val.shape) - for key, val in vit_input.items() - ] - vit_output_info = self.image_session.infer_shapes( - vit_input_info) - vit_output = { - t.name: - torch.empty(tuple(t.shape), - dtype=trt_dtype_to_torch(t.dtype), - device='cuda') - for t in vit_output_info - } - # Run the vision encoder - with torch.cuda.stream(self.vision_stream): - ok = self.image_session.run(vit_input, vit_output, - self.vision_stream.cuda_stream) - assert ok, "Runtime execution failed for vision encoder session" - embeddings = vit_output['encoder_output'].to( - self.vision_output_dtype) - self.vision_stream.synchronize() + embeddings = self.run_vision_encoder(vit_input) + # Post process output and save in responses responses.extend( self.postprocess_output_tensors(embeddings, diff --git a/triton_backend/all_models/multimodal/multimodal_encoders/config.pbtxt b/triton_backend/all_models/multimodal/multimodal_encoders/config.pbtxt index 715c491501..c2a79e01e7 100755 --- a/triton_backend/all_models/multimodal/multimodal_encoders/config.pbtxt +++ b/triton_backend/all_models/multimodal/multimodal_encoders/config.pbtxt @@ -72,13 +72,14 @@ input [ dims: [ 1 ] optional: true }, - # input tensors for llava_onevision + # Required for llava_onevision and pixtral { name: "image_sizes" data_type: TYPE_INT64 - dims: [ 2 ] + dims: [ -1, 2 ] optional: true }, + # Required for llava_onevision { name: "is_video_input" data_type: TYPE_BOOL @@ -114,7 +115,7 @@ input [ output [ { name: "OUT_PROMPT_EMBEDDING_TABLE" - data_type: TYPE_FP16 + data_type: 
${prompt_embedding_table_data_type} dims: [ -1, -1 ] }, { diff --git a/triton_backend/all_models/multimodal/requirements-mistral3.1.txt b/triton_backend/all_models/multimodal/requirements-mistral3.1.txt new file mode 100644 index 0000000000..954e44483a --- /dev/null +++ b/triton_backend/all_models/multimodal/requirements-mistral3.1.txt @@ -0,0 +1 @@ +transformers>=4.50.0 diff --git a/triton_backend/all_models/tests/test_llmapi_python_backend.py b/triton_backend/all_models/tests/test_llmapi_python_backend.py index 6ef7cd9946..6ab4120aa4 100644 --- a/triton_backend/all_models/tests/test_llmapi_python_backend.py +++ b/triton_backend/all_models/tests/test_llmapi_python_backend.py @@ -64,6 +64,12 @@ class MockTritonTensor: else: return False + def to_dlpack(self): + if self.is_cpu(): + return self._tensor.__dlpack__() + else: + return self._tensor.to_dlpack() + @dataclass class MockTritonError: diff --git a/triton_backend/all_models/tests/test_python_backend.py b/triton_backend/all_models/tests/test_python_backend.py index b993af957f..8e17f2b09f 100644 --- a/triton_backend/all_models/tests/test_python_backend.py +++ b/triton_backend/all_models/tests/test_python_backend.py @@ -63,6 +63,12 @@ class MockTritonTensor: else: return False + def to_dlpack(self): + if self.is_cpu(): + return self._tensor.__dlpack__() + else: + return self._tensor.to_dlpack() + @dataclass class MockTritonError: diff --git a/triton_backend/all_models/tests/test_triton_decoder.py b/triton_backend/all_models/tests/test_triton_decoder.py index 7ebcb28e99..90f71107b7 100644 --- a/triton_backend/all_models/tests/test_triton_decoder.py +++ b/triton_backend/all_models/tests/test_triton_decoder.py @@ -64,6 +64,12 @@ class MockTritonTensor: else: return False + def to_dlpack(self): + if self.is_cpu(): + return self._tensor.__dlpack__() + else: + return self._tensor.to_dlpack() + @dataclass class MockTritonResponse: diff --git a/triton_backend/ci/L0_backend_trtllm/test.sh b/triton_backend/ci/L0_backend_trtllm/test.sh index 83967d1c58..272a208b53 100644 --- a/triton_backend/ci/L0_backend_trtllm/test.sh +++ b/triton_backend/ci/L0_backend_trtllm/test.sh @@ -197,6 +197,7 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do replace_config_tags '${max_queue_delay_microseconds}' "50000" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" replace_config_tags '${encoder_input_features_data_type}' "TYPE_FP16" "${MODEL_DIR}/tensorrt_llm/config.pbtxt" + replace_config_tags '${prompt_embedding_table_data_type}' 'TYPE_FP16' "${MODEL_DIR}/tensorrt_llm/config.pbtxt" replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_DIR}/postprocessing/config.pbtxt" replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_DIR}/postprocessing/config.pbtxt" replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_DIR}/postprocessing/config.pbtxt" diff --git a/triton_backend/tools/multimodal/client.py b/triton_backend/tools/multimodal/client.py index bac2b4ef5b..b77de50e8d 100755 --- a/triton_backend/tools/multimodal/client.py +++ b/triton_backend/tools/multimodal/client.py @@ -6,6 +6,8 @@ import io import os import sys from datetime import datetime +from pathlib import Path +from typing import List, Tuple sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) @@ -19,8 +21,32 @@ from transformers import AutoProcessor, Blip2Processor from utils import utils +def pixtral_pad_images( + image_list: List[Image.Image]) -> Tuple[np.ndarray, 
np.ndarray]: + if not image_list: + return np.empty((0, 0, 0, 0), dtype=np.uint8), np.empty((0, 2), + dtype=np.int64) + image_list_np = [np.array(img) for img in image_list] + shapes = [img.shape for img in image_list_np] + assert all(len(s) == 3 + for s in shapes), "All input images must have three dimensions" + assert all(s[-1] == shapes[0][-1] for s in + shapes), "All input images must have the same number of channels" + max_h, max_w = max(s[0] for s in shapes), max(s[1] for s in shapes) + for i in range(len(image_list_np)): + image_list_np[i] = np.pad(image_list_np[i], + ((0, max_h - image_list_np[i].shape[0]), + (0, max_w - image_list_np[i].shape[1]), + (0, 0)), + mode='constant') + raw_image = np.stack(image_list_np, axis=0) + image_sizes = np.array([s[:2] for s in shapes], dtype=np.int64) + return raw_image, image_sizes + + def prepare_inputs(text_data, image_data, + image_sizes, request_output_len_data, beam_width_data, temperature_data, @@ -35,7 +61,6 @@ def prepare_inputs(text_data, image_input_name="image_input"): inputs = [ utils.prepare_tensor("text_input", text_data, grpcclient), - utils.prepare_tensor(image_input_name, image_data, grpcclient), utils.prepare_tensor("max_tokens", request_output_len_data, grpcclient), utils.prepare_tensor("beam_width", beam_width_data, grpcclient), utils.prepare_tensor("temperature", temperature_data, grpcclient), @@ -45,6 +70,14 @@ def prepare_inputs(text_data, utils.prepare_tensor("top_p", top_p_data, grpcclient), utils.prepare_tensor("stream", streaming_data, grpcclient), ] + if image_data is not None: + inputs += [ + utils.prepare_tensor(image_input_name, image_data, grpcclient), + ] + if image_sizes is not None: + inputs += [ + utils.prepare_tensor("image_sizes_input", image_sizes, grpcclient), + ] if repetition_penalty_data is not None: inputs += [ utils.prepare_tensor("repetition_penalty", repetition_penalty_data, @@ -63,20 +96,16 @@ def prepare_inputs(text_data, return inputs -def load_image(image_path): +def load_image(image_path) -> Image.Image: if image_path.startswith("http") or image_path.startswith("https"): - image = Image.open(requests.get(image_path, - stream=True).raw).convert("RGB") + image_bytes = requests.get(image_path, stream=True).content elif image_path.startswith("data:image/jpeg;base64,"): image_base64 = image_path.split(",")[1] - # Decode the base64 string - image_data = base64.b64decode(image_base64) - # Create a BytesIO object from the decoded data - image_buffer = io.BytesIO(image_data) - image = Image.open(image_buffer).convert("RGB") + image_bytes = base64.b64decode(image_base64) else: - image = Image.open(image_path).convert("RGB") - return image + image_bytes = Path(image_path).read_bytes() + + return Image.open(io.BytesIO(image_bytes)).convert("RGB") def load_video(video_path, num_of_frames): @@ -239,7 +268,7 @@ if __name__ == "__main__": required=True, choices=[ 'blip2', 'llava', 'vila', 'mllama', - 'llava_onevision', 'qwen2_vl' + 'llava_onevision', 'qwen2_vl', 'pixtral' ], help="Model type") parser.add_argument("--hf_model_dir", @@ -249,11 +278,18 @@ if __name__ == "__main__": help="path to the model directory") FLAGS = parser.parse_args() # load and process images or video + image_sizes = np.empty((0, 2), dtype=np.int64) if 'vila' in FLAGS.model_type: image_paths = FLAGS.image.split(",") raw_image = [] for image_path in image_paths: raw_image.append(load_image(image_path)) + elif 'pixtral' in FLAGS.model_type: + image_paths = FLAGS.image.split(",") if FLAGS.image else [] + raw_image = [] + for image_path 
in image_paths: + raw_image.append(load_image(image_path)) + raw_image, image_sizes = pixtral_pad_images(raw_image) elif FLAGS.video is not None: assert FLAGS.video_num_frames is not None, "Number of frames should be provided for video input." raw_video = load_video(FLAGS.video, FLAGS.video_num_frames) @@ -303,6 +339,9 @@ if __name__ == "__main__": FLAGS.text = image_tag + FLAGS.text image_data = np.array([[raw_image]]) image_input_name = "image_bytes_input" + elif 'pixtral' in FLAGS.model_type: + image_data = np.array([raw_image]) + image_input_name = "image_bytes_input" elif 'llava_onevision' in FLAGS.model_type: if FLAGS.video is not None: image_data = np.array([raw_video]) @@ -334,6 +373,9 @@ if __name__ == "__main__": temperature_data = np.array(temperature, dtype=np.float32) streaming = [[FLAGS.streaming]] streaming_data = np.array(streaming, dtype=bool) + image_data = None if image_data.size == 0 else image_data + image_sizes_data = None if image_sizes.size == 0 else np.array( + [image_sizes], dtype=np.int64) model_name = "ensemble" if FLAGS.use_bls: @@ -356,6 +398,7 @@ if __name__ == "__main__": inputs = prepare_inputs(text_data, image_data, + image_sizes_data, request_output_len_data, beam_width_data, temperature_data, From 344bc4575d01caabbed298069abe4eefd899f466 Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Fri, 22 Aug 2025 00:08:55 +0800 Subject: [PATCH 13/33] [None][infra] Waive failed case for main branch (#7129) Signed-off-by: qqiao --- tests/integration/test_lists/waives.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 86af57819b..fb7a7a50b9 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -319,3 +319,4 @@ full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False] SKIP (https://nvbugs/5467815) full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] SKIP (https://nvbugs/5467815) full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] SKIP (https://nvbugs/5467815) +accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5470769) From e18dacc931daa303720c1bd539314711faa103e8 Mon Sep 17 00:00:00 2001 From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com> Date: Thu, 21 Aug 2025 10:30:36 -0700 Subject: [PATCH 14/33] [#4403][refactor] Move fusion, kvcache, and compile to modular inference optimizer (#7057) Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> Co-authored-by: h-guo18 <67671475+h-guo18@users.noreply.github.com> --- .../_torch/auto_deploy/config/default.yaml | 49 +- .../_torch/auto_deploy/transform/interface.py | 6 +- .../transform/library/collectives.py | 204 +++++++ .../transform/library/compile_model.py | 65 +++ .../library/fusion.py | 58 +- .../auto_deploy/transform/library/kvcache.py | 299 ++++++++++ .../auto_deploy/transform/library/rms_norm.py | 148 +++++ .../transformations/library/__init__.py | 6 - .../transformations/library/collectives.py | 167 ------ .../transformations/library/fused_moe.py | 511 ------------------ .../transformations/library/kvcache.py | 193 ------- .../transformations/library/rms_norm.py | 113 ---- .../auto_deploy/transformations/transform.py | 134 ++--- 
.../test_allreduce_residual_rmsnorm_fusion.py | 21 +- .../library/test_collective_fusion.py | 19 +- .../library/test_fuse_rmsnorm.py | 30 +- .../library/test_gemm_fusion.py | 19 +- .../transformations/library/test_kv_cache.py | 117 ++-- 18 files changed, 969 insertions(+), 1190 deletions(-) create mode 100644 tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py create mode 100644 tensorrt_llm/_torch/auto_deploy/transform/library/compile_model.py rename tensorrt_llm/_torch/auto_deploy/{transformations => transform}/library/fusion.py (76%) create mode 100644 tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py create mode 100644 tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py delete mode 100644 tensorrt_llm/_torch/auto_deploy/transformations/library/collectives.py delete mode 100644 tensorrt_llm/_torch/auto_deploy/transformations/library/fused_moe.py delete mode 100644 tensorrt_llm/_torch/auto_deploy/transformations/library/kvcache.py delete mode 100644 tensorrt_llm/_torch/auto_deploy/transformations/library/rms_norm.py diff --git a/tensorrt_llm/_torch/auto_deploy/config/default.yaml b/tensorrt_llm/_torch/auto_deploy/config/default.yaml index f7ad7934a9..041d51e73d 100644 --- a/tensorrt_llm/_torch/auto_deploy/config/default.yaml +++ b/tensorrt_llm/_torch/auto_deploy/config/default.yaml @@ -19,6 +19,11 @@ transforms: stage: post_export cleanup_input_constraints: stage: post_export + ############################################################################################ + # RUN PATTERN MATCHER TRANSFORMATIONS TO STANDARDIZE GRAPH REPRESENTATION + ############################################################################################ + match_moe_pattern: + stage: pattern_matcher match_repeat_kv: stage: pattern_matcher match_eager_attention: @@ -27,12 +32,13 @@ transforms: stage: pattern_matcher match_attention_layout: stage: pattern_matcher - match_moe_pattern: - stage: pattern_matcher match_rope_pattern: stage: pattern_matcher match_rope_layout: stage: pattern_matcher + ############################################################################################ + # RUN TRANSFORMATIONS ON STANDARDIZED GRAPH REPRESENTATION + ############################################################################################ eliminate_redundant_transposes: stage: pattern_matcher # TODO (lucaslie): let's move this to perf optimization once TP sharding is improved @@ -57,5 +63,44 @@ transforms: sharding_transform_executor: stage: sharding run_shape_prop: true + ############################################################################################ + # MOVE MODEL AND LOAD WEIGHTS + ############################################################################################ load_weights: stage: weight_load + ############################################################################################ + # RUN POST-LOAD FUSION AND OPTIMIZATIONS + ############################################################################################ + # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/4674 this is causing OOMs + # fuse_moe: + # stage: post_load_fusion + # fuse_gemms: + # stage: post_load_fusion + fuse_allreduce_residual_rmsnorm: + stage: post_load_fusion + fuse_collectives: + stage: post_load_fusion + # TODO (lucaslie): add backend selection as part of configurable inference optimizers + # check if we can fuse rmsnorm + fuse_rmsnorm: + stage: post_load_fusion + backend: flashinfer + 
############################################################################################ + # SWITCH TO CACHED+FLATTENED ATTENTION + INITIALIZE CACHES + ############################################################################################ + update_in_out_nodes: + stage: cache_init + insert_cached_attention: + stage: cache_init + insert_cached_mla_attention: + stage: cache_init + attn_backend: MultiHeadLatentAttention + initialize_cache: + stage: cache_init + resize_kv_cache: + stage: cache_init + ############################################################################################ + # COMPILE MODEL + ############################################################################################ + compile_model: + stage: compile diff --git a/tensorrt_llm/_torch/auto_deploy/transform/interface.py b/tensorrt_llm/_torch/auto_deploy/transform/interface.py index 1087714177..cddc56b872 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/interface.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/interface.py @@ -54,6 +54,7 @@ class SharedConfig(BaseModel): sharding_config: ShardingConfig = Field(default_factory=ShardingConfig) local_rank: int = Field(default=0) world_size: int = Field(default=1) + attn_backend: str = Field(default="flashinfer", description="The attention backend to use.") class TransformConfig(BaseModel): @@ -285,7 +286,10 @@ class BaseTransform(ABC): # update + store new meta data history[t_name] = info autodeploy_meta[self._history_key] = history - self._set_autodeploy_meta(gm, autodeploy_meta) + + if isinstance(gm, GraphModule): + # After compilation, gm becomes type CapturedGraph with no meta data. + self._set_autodeploy_meta(gm, autodeploy_meta) # return the graph module return gm diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py b/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py new file mode 100644 index 0000000000..6c5b1fe2b9 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py @@ -0,0 +1,204 @@ +import operator +from typing import Tuple + +import torch +from torch.fx import GraphModule + +from ...distributed.trtllm import is_trtllm_op_available +from ...models.factory import ModelFactory +from ...shim.interface import CachedSequenceInterface +from ...utils.node_utils import get_op_overload_packet, get_user_if_pattern_match, is_op +from ..interface import BaseTransform, SharedConfig, TransformInfo, TransformRegistry + +# TODO: This is an overly simplified model that works well for vanilla Llama models. +# However, we eventually want to consider more sophisticated patterns such as +# * all_reduce(lin1(x) + lin2(x)) +# * version above with fused GEMMs (i.e. with a split node) +# * all_reduce(pointwise_op(linear(x))) +# * ... + + +@TransformRegistry.register("fuse_collectives") +class FuseCollectives(BaseTransform): + """ + Fuses all_reduce ops with preceding (quantized) linear ops into a single fused node for improved performance. + """ + + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + num_gemm_collective_fusions = 0 + + # lookup for fused ops + # TODO: avoid this hardcoded lookup, e.g., by generating fused ops on the fly. 
+ lookup = { + torch.ops.auto_deploy.torch_linear_simple: torch.ops.auto_deploy.trtllm_dist_fused_linear_all_reduce, + torch.ops.aten.linear: torch.ops.auto_deploy.trtllm_dist_fused_linear_all_reduce, + torch.ops.auto_deploy.torch_quant_fp8_linear: torch.ops.auto_deploy.torch_quant_fused_fp8_linear_all_reduce, + } + + # go through all nodes and find all_reduce nodes + for node in gm.graph.nodes: + if not is_op(node, torch.ops.auto_deploy.torch_dist_all_reduce): + continue + + # check if args are as expected + assert len(node.args) == 1 and not len(node.kwargs), ( + "Unexpected args/kwargs for all_reduce" + ) + + # retrieve parent and check a few conditions on the parent node + parent_node = node.args[0] + if not is_op(parent_node, lookup.keys()): + continue + if len(parent_node.users) > 1: + continue + + with gm.graph.inserting_before(node): + # insert fused node + fused_linear_collective_node = gm.graph.call_function( + lookup[get_op_overload_packet(parent_node.target)], + args=parent_node.args, + kwargs=parent_node.kwargs, + ) + node.replace_all_uses_with(fused_linear_collective_node) + gm.graph.erase_node(node) + gm.graph.erase_node(parent_node) + num_gemm_collective_fusions += 1 + + info = TransformInfo( + skipped=False, + num_matches=num_gemm_collective_fusions, + is_clean=False, + has_valid_shapes=False, + ) + + return gm, info + + +@TransformRegistry.register("fuse_allreduce_residual_rmsnorm") +class FuseAllreduceResidualRMSNorm(BaseTransform): + """Essentially, this transformation fuses the following operators into one allreduce trtllm implementation. + + * target pattern: + x = all_reduce(x) + y = x + residual + return rmsnorm(y), y + * replacement: + fused_allreduce_residual_rmsnorm(x, residual, rmsnorm_weight, rmsnorm_eps) + + """ + + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + if not is_trtllm_op_available(): + return gm, TransformInfo( + skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True + ) + + num_ar_r_rms_fusions = 0 + + def trace_and_fuse(allreduce_node, graph): + # Check if all_reduce is followed by addition + users = list(allreduce_node.users.keys()) + if len(users) != 1: + return # Skip if all_reduce has more than one consumer + add_node = users[0] + + # Traverse nodes for RMSNorm pattern which is composed of to_copy, pow, mean, add, refer + # the Huggingface LlamaRMSNorm implementation as example for more details + to_copy_1 = get_user_if_pattern_match(add_node, [torch.ops.aten.add, operator.add], 2) + # operand of pow and mul + pow_node = get_user_if_pattern_match( + to_copy_1, [torch.ops.aten._to_copy, torch.ops.aten.to], 2 + ) + mean_node = get_user_if_pattern_match(pow_node, torch.ops.aten.pow, 1) + add_eps_node = get_user_if_pattern_match(mean_node, torch.ops.aten.mean, 1) + rsqrt_node = get_user_if_pattern_match( + add_eps_node, [torch.ops.aten.add, operator.add], 1 + ) + mul_node_1 = get_user_if_pattern_match(rsqrt_node, torch.ops.aten.rsqrt, 1) + to_copy_2 = get_user_if_pattern_match(mul_node_1, torch.ops.aten.mul, 1) + mul_node_2 = get_user_if_pattern_match( + to_copy_2, [torch.ops.aten._to_copy, torch.ops.aten.to], 1 + ) + # check args of ops: pow(2) and mean(-1) + ARGS_MATCH = pow_node is not None and pow_node.args[1] == 2 # exponent + ARGS_MATCH &= mean_node is not None and mean_node.args[1] == [-1] # dimensions + + # Match found: Replace with fused operation + if ( + to_copy_1 + and pow_node + and mean_node + and 
add_eps_node + and rsqrt_node + and mul_node_1 + and to_copy_2 + and mul_node_2 + and ARGS_MATCH + ): + # Gather the inputs for the custom operation + tensor = allreduce_node.args[0] + # Identify the residual argument in the add operation + # One of the args in add_node.args is the output of all_reduce + # The same idea also applies to norm_weight + residual = ( + add_node.args[0] if add_node.args[1] is allreduce_node else add_node.args[1] + ) + norm_weight = ( + mul_node_2.args[0] if mul_node_2.args[1] is to_copy_2 else mul_node_2.args[1] + ) + eps = add_eps_node.args[1] + + # Insert nodes + with graph.inserting_before(allreduce_node): + fused_node = graph.call_function( + torch.ops.dist.fused_allreduce_residual_rmsnorm, + args=( + tensor, + residual, + norm_weight, + eps, + ), + ) + # Extract outputs from the tuple returned by `fused_node` + final_output_node = gm.graph.create_node( + "call_function", + target=operator.getitem, + args=(fused_node, 0), + ) + add_output_node = gm.graph.create_node( + "call_function", + target=operator.getitem, + args=(fused_node, 1), + ) + + # Replace all uses of rmsnorm_node with final_output_node + mul_node_2.replace_all_uses_with(final_output_node) + + # Replace all uses of add_node with add_output_node + add_node.replace_all_uses_with(add_output_node) + + nonlocal num_ar_r_rms_fusions + num_ar_r_rms_fusions += 1 + + # Traverse all nodes + for node in gm.graph.nodes: + if is_op(node, torch.ops.auto_deploy.torch_dist_all_reduce): + trace_and_fuse(allreduce_node=node, graph=gm.graph) + + info = TransformInfo( + skipped=False, num_matches=num_ar_r_rms_fusions, is_clean=False, has_valid_shapes=False + ) + + return gm, info diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/compile_model.py b/tensorrt_llm/_torch/auto_deploy/transform/library/compile_model.py new file mode 100644 index 0000000000..00601303b6 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/compile_model.py @@ -0,0 +1,65 @@ +from typing import List, Literal, Optional, Tuple, Type + +from pydantic import Field +from torch.fx import GraphModule + +from ...compile import compile_and_capture +from ...models.factory import ModelFactory +from ...shim.interface import CachedSequenceInterface +from ..interface import ( + BaseTransform, + SharedConfig, + TransformConfig, + TransformInfo, + TransformRegistry, +) + + +class CompileModelConfig(TransformConfig): + """Configuration for the compile model transform.""" + + cuda_graph_batch_sizes: Optional[List[int]] = Field( + default=None, description="The batch sizes to use for CUDA graphs." + ) + num_batched_inputs: int = Field( + default=2, description="The number of batched inputs to use for CUDA graphs." 
+ ) + compile_backend: Literal["torch-simple", "torch-compile", "torch-cudagraph", "torch-opt"] = ( + Field(description="The backend to use for compiling the model.") + ) + + +@TransformRegistry.register("compile_model") +class CompileModel(BaseTransform): + """A transform to compile the model.""" + + config: CompileModelConfig + + @classmethod + def get_config_class(cls) -> Type[TransformConfig]: + return CompileModelConfig + + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + cm.info.set_generate_only_batch() + egm_compiled = compile_and_capture( + gm, + self.config.compile_backend, + args=cm.args, + dynamic_shapes=cm.dynamic_shapes, + compiler_kwargs={ + "cuda_graph_batch_sizes": self.config.cuda_graph_batch_sizes, + "num_batched_inputs": self.config.num_batched_inputs, + }, + ) + cm.info.reset() + + # store info object about the transform + info = TransformInfo(skipped=False, num_matches=1, is_clean=True, has_valid_shapes=True) + + return egm_compiled, info diff --git a/tensorrt_llm/_torch/auto_deploy/transformations/library/fusion.py b/tensorrt_llm/_torch/auto_deploy/transform/library/fusion.py similarity index 76% rename from tensorrt_llm/_torch/auto_deploy/transformations/library/fusion.py rename to tensorrt_llm/_torch/auto_deploy/transform/library/fusion.py index e66ced8ae6..2d422c42d6 100644 --- a/tensorrt_llm/_torch/auto_deploy/transformations/library/fusion.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/fusion.py @@ -6,6 +6,8 @@ import torch import torch.nn as nn from torch.fx import GraphModule, Node +from ...models.factory import ModelFactory +from ...shim.interface import CachedSequenceInterface from ...utils.cuda_mem_tracker import cuda_memory_tracker from ...utils.logger import ad_logger from ...utils.node_utils import ( @@ -14,7 +16,7 @@ from ...utils.node_utils import ( is_linear_op, ) from ...utils.quantization_utils import QuantizationImpl -from .._graph import canonicalize_graph +from ..interface import BaseTransform, SharedConfig, TransformInfo, TransformRegistry def _insert_fused_gemm(gm: GraphModule, idx: int, parent_node: Node, linear_nodes: List[Node]): @@ -116,30 +118,36 @@ def _insert_fused_gemm(gm: GraphModule, idx: int, parent_node: Node, linear_node gm.delete_all_unused_submodules() -def fuse_gemms(gm: GraphModule) -> None: - ad_logger.info("GEMM fusion") - ad_logger.debug("Before GEMM fusion: " + str(gm)) - # sort linear nodes by parent node - linear_nodes = defaultdict(list) - for node in gm.graph.nodes: - # TODO: we don't handle bias for now... - if is_linear_op(node, include_quantization=True) and node.args[2] is None: - linear_nodes[node.args[0]].append(node) +@TransformRegistry.register("fuse_gemms") +class FuseGemms(BaseTransform): + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + # sort linear nodes by parent node + linear_nodes = defaultdict(list) + for node in gm.graph.nodes: + # TODO: we don't handle bias for now... 
+ if is_linear_op(node, include_quantization=True) and node.args[2] is None: + linear_nodes[node.args[0]].append(node) - # fuse linear nodes - idx = -1 - with cuda_memory_tracker(): - for parent_node, lin_children in linear_nodes.items(): - if len(lin_children) < 2: - continue - # linear nodes to fuse - ad_logger.debug( - f"Found linear nodes to fuse: {lin_children} with parent node: {parent_node}" - ) - _insert_fused_gemm(gm, idx := idx + 1, parent_node, lin_children) + # fuse linear nodes + idx = -1 + num_matches = 0 + with cuda_memory_tracker(): + for parent_node, lin_children in linear_nodes.items(): + if len(lin_children) < 2: + continue + # linear nodes to fuse + _insert_fused_gemm(gm, idx := idx + 1, parent_node, lin_children) + num_matches += 1 - # clean up and return - canonicalize_graph(gm) + torch.cuda.empty_cache() - ad_logger.debug("After GEMM fusion: " + str(gm)) - torch.cuda.empty_cache() + info = TransformInfo( + skipped=False, num_matches=num_matches, is_clean=False, has_valid_shapes=False + ) + return gm, info diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py b/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py new file mode 100644 index 0000000000..80f9d440c1 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py @@ -0,0 +1,299 @@ +"""Graph transformation to automatically add kv cache into fused MHA op.""" + +import operator +from typing import Dict, Optional, Tuple, Type + +import torch +from pydantic import Field +from torch.fx import Graph, GraphModule, Node + +from ...custom_ops.attention_interface import AttentionRegistry +from ...distributed.common import all_gather_object, get_world_size +from ...models.factory import ModelFactory +from ...shim.interface import CachedSequenceInterface +from ...transformations._graph import add_graph_input +from ...utils.logger import ad_logger +from ...utils.node_utils import get_all_input_output_nodes, is_op +from ..interface import ( + BaseTransform, + SharedConfig, + TransformConfig, + TransformInfo, + TransformRegistry, +) + + +@TransformRegistry.register("update_in_out_nodes") +class UpdateInOutNodes(BaseTransform): + """Modify the graph module by adding new input nodes. + + The new input nodes correspond to the extra arguments needed for cached and flattened attention. + + Args: + egm: The graph module to analyze and modify. + cm: Cached sequence interface containing extra argument information. + """ + + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + # loop through nodes to get input, output, and get_attr nodes + input_nodes, output_nodes = get_all_input_output_nodes(gm.graph) + + # we only expect one input node + assert len(input_nodes) == 2, "Expected exactly two input nodes (input_ids, position_ids)." + + # NOTE: for now, we wanna make sure we *only* return the final output and no hidden states. + # Later on, we can revisit how to support returning hidden states. + assert len(output_nodes) == 1, "Expected exactly one output node!" + assert len(output_nodes[0].all_input_nodes) == 1, ( + "Expected to only return final tensor output!" 
+ ) + + # Activate and add extra argument nodes + new_args = cm.info.switch_to_cached_attn_inputs() + for name in new_args: + input_nodes.append(add_graph_input(gm, name)) + + info = TransformInfo(skipped=False, num_matches=1, is_clean=False, has_valid_shapes=False) + + return gm, info + + +class InsertCachedAttentionConfig(TransformConfig): + """Configuration for the insert cached attention transform.""" + + attn_backend: Optional[str] = Field(default=None, description="The attention backend to use.") + + +@TransformRegistry.register("insert_cached_attention") +class InsertCachedAttention(BaseTransform): + """ + A transform to insert cached attention into the graph module. + + If attn_backend is not provided in transform config, will find from shared config. + """ + + config: InsertCachedAttentionConfig + + @classmethod + def get_config_class(cls) -> Type[TransformConfig]: + return InsertCachedAttentionConfig + + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + """Replace uncached source attention node with corresponding cached attn node.""" + attn_descriptor = AttentionRegistry.get(self.config.attn_backend) + + cache_config = factory.get_cache_config() + + # Get all attention nodes and their info objects + source_op = attn_descriptor.get_source_attention_op() + + # pick up graph + graph: Graph = gm.graph + + # look for relevant source attention nodes + source_attn_nodes = [n for n in graph.nodes if is_op(n, source_op)] + + if not source_attn_nodes: + # If there are no nodes for kv cache insertion found, return current graph + return gm, TransformInfo( + skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True + ) + + # Sanity check + if cm.info.is_paged: + assert attn_descriptor.is_paged(), "Paged sequence info requires paged attention op." + + # retrieve input nodes + input_nodes, _ = get_all_input_output_nodes(gm.graph) + + # insert metadata computation and extract each argument as a node + get_metadata, num_metadata = attn_descriptor.get_prepare_metadata_op() + with graph.inserting_before(input_nodes[-1].next): + ret_node = graph.call_function( + get_metadata, + args=( + *input_nodes, + cm.info.page_size, + ), + ) + metadata_nodes = [ + graph.call_function(operator.getitem, args=(ret_node, idx)) + for idx in range(num_metadata) + ] + + buffer_in_lookup: Dict[str, Node] = {} + + # replace fused attention node with attention node that has kv cache + num_cached_attn_replacements = 0 + for idx, attn_node in enumerate(source_attn_nodes): + # pick out GEMMs + qkv = attn_node.args[: attn_descriptor.get_num_qkv_args()] + + # setup + store cache initializers and caches as input nodes + cache_in_nodes = [] + for k, get_cache in attn_descriptor.get_cache_initializers( + attn_node, cache_config + ).items(): + k_indexed = f"{k}_{idx}" + cm.add_cache(k_indexed, get_cache) + cache_in_nodes.append(add_graph_input(gm, k_indexed)) + + # setup + store global buffer initializers and buffers as input nodes + # NOTE: we have to check against existing keys to make sure nothing is registered twice... 
+ buffer_in_nodes = [] + for k, get_buffer in attn_descriptor.get_global_buffer_initializers(attn_node).items(): + if k not in buffer_in_lookup: + cm.add_cache(k, get_buffer) + buffer_in_lookup[k] = add_graph_input(gm, k) + buffer_in_nodes.append(buffer_in_lookup[k]) # store buffer nodes for this op + + # retrieve constants for attention_op + constants = attn_descriptor.get_constants(attn_node) + + # insert cached attention replacement op + with graph.inserting_before(attn_node): + cached_attn_node = graph.call_function( + attn_descriptor.get_cached_attention_op(), + args=(*qkv, *metadata_nodes, *cache_in_nodes, *buffer_in_nodes, *constants), + ) + attn_node.replace_all_uses_with(cached_attn_node) + graph.erase_node(attn_node) + num_cached_attn_replacements += 1 + + info = TransformInfo( + skipped=False, + num_matches=num_cached_attn_replacements, + is_clean=False, + has_valid_shapes=False, + ) + + return gm, info + + +@TransformRegistry.register("insert_cached_mla_attention") +class InsertCachedMLAAttention(InsertCachedAttention): + """ + A transform to insert cached MLA attention into the graph module. + + This class is identical to InsertCachedAttention and inherits all its behavior. + """ + + pass + + +class ResizeKVCacheConfig(TransformConfig): + """Configuration for the resize kv cache transform.""" + + free_mem_ratio: float = Field( + description="The fraction of available memory to occupy.", default=0.8 + ) + + +@TransformRegistry.register("resize_kv_cache") +class ResizeKVCache(BaseTransform): + """Inflate the kv cache to occupy the available GPU memory. + + free_mem_ratio specifies the fraction of available memory to occupy. + """ + + config: ResizeKVCacheConfig + + @classmethod + def get_config_class(cls) -> Type[TransformConfig]: + return ResizeKVCacheConfig + + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + free_mem_ratio = self.config.free_mem_ratio + + def _get_mem_info_in_mb(): + free_mem, total_mem = torch.cuda.mem_get_info() + return free_mem // 1024**2, total_mem // 1024**2 + + free_mem, total_mem = _get_mem_info_in_mb() + ad_logger.info(f"Free memory (MB): {free_mem}, Total memory (MB): {total_mem}") + current_cache_size = cm.current_cache_size_bytes() + current_num_pages = cm.info.num_pages + ad_logger.info( + f"Current cache size: {current_cache_size}, Current num pages: {current_num_pages}" + ) + + if free_mem_ratio == 0.0: + ad_logger.info(f"Skipping cache resize for {free_mem_ratio=}") + return gm, TransformInfo( + skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True + ) + + try: + # Let's run a forward pass to get the memory usage + cm.info._set_max_num_tokens_sample() + free_mem_pre, _ = _get_mem_info_in_mb() + ad_logger.info(f"Free memory before forward pass (MB): {free_mem_pre}") + + gm(*cm.args) + + free_mem_post, _ = _get_mem_info_in_mb() + ad_logger.info(f"Free memory after forward pass (MB): {free_mem_post}") + + memory_for_forward_pass = free_mem_pre - free_mem_post + ad_logger.info(f"Memory for forward pass (MB): {memory_for_forward_pass}") + + new_cache_size = free_mem_post * 1024 * 1024 * free_mem_ratio + current_cache_size + new_num_pages = int(new_cache_size // (current_cache_size // current_num_pages)) + + # Need to sync all the GPUs + gathered_num_pages = [None] * get_world_size() + all_gather_object(gathered_num_pages, new_num_pages) + new_num_pages = min(gathered_num_pages) + ad_logger.info(f"After all_gather - 
new_num_pages: {new_num_pages}") + + cm.resize_cache(new_num_pages) + except Exception as e: + ad_logger.warning( + f"Error encountered while resizing kv cache: {e}.\nSkipping cache resize." + ) + + # Free memory + torch.cuda.empty_cache() + + info = TransformInfo( + skipped=False, + num_matches=0, + is_clean=True, + has_valid_shapes=True, + ) + + return gm, info + + +@TransformRegistry.register("initialize_cache") +class InitializeCache(BaseTransform): + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + cm.initialize_caches() + + info = TransformInfo(skipped=False, num_matches=1, is_clean=True, has_valid_shapes=True) + + return gm, info diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py new file mode 100644 index 0000000000..1772037d93 --- /dev/null +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py @@ -0,0 +1,148 @@ +"""Graph transform to optimize RMSNorm execution using FlashInfer.""" + +from functools import partial +from typing import Tuple, Type + +import torch +from pydantic import Field +from torch.fx import GraphModule + +from ...models.factory import ModelFactory +from ...shim.interface import CachedSequenceInterface + +# It is important to import ADPatternMatcherPass from pattern_matcher.py, not from torch._inductor.pattern_matcher +from ...utils.pattern_matcher import ADPatternMatcherPass, register_ad_pattern +from ..interface import ( + BaseTransform, + SharedConfig, + TransformConfig, + TransformInfo, + TransformRegistry, +) + +_BACKEND_OPS = { + "flashinfer": torch.ops.auto_deploy.flashinfer_rms_norm, + "triton": torch.ops.auto_deploy.triton_rms_norm, + "torch": torch.ops.auto_deploy.torch_rmsnorm, +} + + +def _rms_norm_pattern(data: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor: + """Implements the RMSNorm pattern for pattern matching. + + Args: + data: Input tensor to normalize. + weight: Scaling weights for the normalized output. + eps: Small constant for numerical stability. + + Returns: + Normalized and scaled tensor. + """ + input_dtype = data.dtype + data = data.to(torch.float32) + variance = data.pow(2).mean(-1, keepdim=True) + data = data * torch.rsqrt(variance + eps) + return weight * data.to(input_dtype) + + +def _rms_norm_replacement( + data: torch.Tensor, weight: torch.Tensor, eps: float, backend: str +) -> torch.Tensor: + """Backend-specific rms_norm implementation. + + Args: + data: Input tensor to normalize. + weight: Scaling weights for the normalized output. + eps: Small constant for numerical stability. + backend: Backend to use for RMSNorm computation ("flashinfer" or "triton"). + + Returns: + Normalized and scaled tensor using the specified backend implementation. + """ + + assert backend.lower() in _BACKEND_OPS, ( + f"Invalid {backend=}; must be one of {list(_BACKEND_OPS)}" + ) + return _BACKEND_OPS[backend.lower()](data, weight, eps) + + +class FuseRMSNormConfig(TransformConfig): + """Configuration for the RMSNorm fusion transform.""" + + backend: str = Field( + default="flashinfer", + description="Backend to use for RMSNorm computation ('flashinfer' or 'triton').", + ) + + +@TransformRegistry.register("fuse_rmsnorm") +class FuseRMSNorm(BaseTransform): + """Matches and replaces RMSNorm patterns in the graph with FlashInfer or Triton implementation. 
+ + This function sets up pattern matching to identify RMSNorm operations in the graph + and replaces them with optimized implementations. It uses dummy tensors to register + the pattern matching rules. + + Args: + gm: Input graph module to transform. + backend: Backend to use for RMSNorm computation ("flashinfer" or "triton"). + + Returns: + Transformed graph module with optimized RMSNorm operations. + """ + + config: FuseRMSNormConfig + + @classmethod + def get_config_class(cls) -> Type[TransformConfig]: + return FuseRMSNormConfig + + def _apply( + self, + gm: GraphModule, + cm: CachedSequenceInterface, + factory: ModelFactory, + shared_config: SharedConfig, + ) -> Tuple[GraphModule, TransformInfo]: + if self.config.backend.lower() not in _BACKEND_OPS: + raise ValueError( + f"Invalid backend, must be one of {list(_BACKEND_OPS)}, got {self.config.backend}" + ) + + graph = gm.graph + patterns = ADPatternMatcherPass() + + # Create dummy tensors for pattern matching + bs = 2 + hidden_size = 512 + + def dummy_args(input_dtype: torch.dtype, weight_dtype: torch.dtype, eps: float = 1e-6): + return [ + torch.randn(bs, hidden_size, device="cuda", dtype=input_dtype), + torch.randn(hidden_size, device="cuda", dtype=weight_dtype), + eps, + ] + + # Define configurations for different data types + configs = [ + (torch.bfloat16, torch.bfloat16), + (torch.float16, torch.float16), + (torch.float32, torch.float32), + ] + + # Register patterns for each configuration + for input_dtype, weight_dtype in configs: + register_ad_pattern( + search_fn=_rms_norm_pattern, + replace_fn=partial(_rms_norm_replacement, backend=self.config.backend), + patterns=patterns, + dummy_args=dummy_args(input_dtype, weight_dtype), + op_ignore_types={}, + scalar_workaround={"eps": 1e-6}, + ) + + cnt = patterns.apply(graph) + + info = TransformInfo(skipped=False, num_matches=cnt, is_clean=False, has_valid_shapes=False) + + return gm, info diff --git a/tensorrt_llm/_torch/auto_deploy/transformations/library/__init__.py b/tensorrt_llm/_torch/auto_deploy/transformations/library/__init__.py index 0d4c388ebc..e5260ada48 100644 --- a/tensorrt_llm/_torch/auto_deploy/transformations/library/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/transformations/library/__init__.py @@ -1,11 +1,5 @@ """A library of transformation passes.""" -from .collectives import * -from .fused_moe import * -from .fusion import * -from .kvcache import * -from .rms_norm import * - try: from .visualization import visualize_namespace except ImportError: diff --git a/tensorrt_llm/_torch/auto_deploy/transformations/library/collectives.py b/tensorrt_llm/_torch/auto_deploy/transformations/library/collectives.py deleted file mode 100644 index 8cec047561..0000000000 --- a/tensorrt_llm/_torch/auto_deploy/transformations/library/collectives.py +++ /dev/null @@ -1,167 +0,0 @@ -import operator - -import torch -from torch.fx import GraphModule - -from ...distributed.trtllm import is_trtllm_op_available -from ...utils.logger import ad_logger -from ...utils.node_utils import get_op_overload_packet, get_user_if_pattern_match, is_op -from .._graph import canonicalize_graph - - -# TODO: This is an overly simplified model that works well for vanilla Llama models. -# However, we eventually want to consider more sophisticated patterns such as -# * all_reduce(lin1(x) + lin2(x)) -# * version above with fused GEMMs (i.e. with a split node) -# * all_reduce(pointwise_op(linear(x))) -# * ... 
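
For orientation, here is a minimal illustration of the rewrite performed by fuse_collectives below, using the op names from its lookup table; this sketch is an editor-added aside and not part of the patch itself:

    # Illustrative sketch, not part of this patch: the simplest supported fusion.
    #
    #   before:  h = torch.ops.auto_deploy.torch_linear_simple(*linear_args)
    #            y = torch.ops.auto_deploy.torch_dist_all_reduce(h)
    #
    #   after:   y = torch.ops.auto_deploy.trtllm_dist_fused_linear_all_reduce(*linear_args)
    #
    # The fused node reuses the linear node's original args/kwargs, and the rewrite is
    # only applied when the linear node's sole user is the all_reduce, so both original
    # nodes can be erased safely.
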
-def fuse_collectives(gm: GraphModule) -> None: - num_gemm_collective_fusions = 0 - ad_logger.debug("Before GEMM+Collective fusion: " + str(gm)) - - # lookup for fused ops - # TODO: avoid this hardcoded lookup, e.g., by generating fused ops on the fly. - lookup = { - torch.ops.auto_deploy.torch_linear_simple: torch.ops.auto_deploy.trtllm_dist_fused_linear_all_reduce, - torch.ops.aten.linear: torch.ops.auto_deploy.trtllm_dist_fused_linear_all_reduce, - torch.ops.auto_deploy.torch_quant_fp8_linear: torch.ops.auto_deploy.torch_quant_fused_fp8_linear_all_reduce, - } - - # go through all nodes and find all_reduce nodes - for node in gm.graph.nodes: - if not is_op(node, torch.ops.auto_deploy.torch_dist_all_reduce): - continue - - # check if args are as expected - assert len(node.args) == 1 and not len(node.kwargs), "Unexpected args/kwargs for all_reduce" - - # retrieve parent and check a few conditions on the parent node - parent_node = node.args[0] - if not is_op(parent_node, lookup.keys()): - continue - if len(parent_node.users) > 1: - continue - - with gm.graph.inserting_before(node): - # insert fused node - fused_linear_collective_node = gm.graph.call_function( - lookup[get_op_overload_packet(parent_node.target)], - args=parent_node.args, - kwargs=parent_node.kwargs, - ) - node.replace_all_uses_with(fused_linear_collective_node) - gm.graph.erase_node(node) - gm.graph.erase_node(parent_node) - num_gemm_collective_fusions += 1 - - canonicalize_graph(gm) - ad_logger.info(f"Found {num_gemm_collective_fusions} GEMM+Collective fusions") - ad_logger.debug("After GEMM+Collective fusion: " + str(gm)) - - -def fuse_allreduce_residual_rmsnorm(gm: GraphModule) -> None: - """Essentially, this function fuses the following operators into one allreduce trtllm implementation. 
- - * target pattern: - x = all_reduce(x) - y = x + residual - return rmsnorm(y), y - * replacement: - fused_allreduce_residual_rmsnorm(x, residual, rmsnorm_weight, rmsnorm_eps) - - """ - if not is_trtllm_op_available(): - return - - num_ar_r_rms_fusions = 0 - ad_logger.debug("Before allreduce+residual+rmsnorm fusion: " + str(gm)) - - def trace_and_fuse(allreduce_node, graph): - # Check if all_reduce is followed by addition - users = list(allreduce_node.users.keys()) - if len(users) != 1: - return # Skip if all_reduce has more than one consumer - add_node = users[0] - - # Traverse nodes for RMSNorm pattern which is composed of to_copy, pow, mean, add, refer - # the Huggingface LlamaRMSNorm implementation as example for more details - to_copy_1 = get_user_if_pattern_match(add_node, [torch.ops.aten.add, operator.add], 2) - # operand of pow and mul - pow_node = get_user_if_pattern_match( - to_copy_1, [torch.ops.aten._to_copy, torch.ops.aten.to], 2 - ) - mean_node = get_user_if_pattern_match(pow_node, torch.ops.aten.pow, 1) - add_eps_node = get_user_if_pattern_match(mean_node, torch.ops.aten.mean, 1) - rsqrt_node = get_user_if_pattern_match(add_eps_node, [torch.ops.aten.add, operator.add], 1) - mul_node_1 = get_user_if_pattern_match(rsqrt_node, torch.ops.aten.rsqrt, 1) - to_copy_2 = get_user_if_pattern_match(mul_node_1, torch.ops.aten.mul, 1) - mul_node_2 = get_user_if_pattern_match( - to_copy_2, [torch.ops.aten._to_copy, torch.ops.aten.to], 1 - ) - # check args of ops: pow(2) and mean(-1) - ARGS_MATCH = pow_node is not None and pow_node.args[1] == 2 # exponent - ARGS_MATCH &= mean_node is not None and mean_node.args[1] == [-1] # dimensions - - # Match found: Replace with fused operation - if ( - to_copy_1 - and pow_node - and mean_node - and add_eps_node - and rsqrt_node - and mul_node_1 - and to_copy_2 - and mul_node_2 - and ARGS_MATCH - ): - # Gather the inputs for the custom operation - tensor = allreduce_node.args[0] - # Identify the residual argument in the add operation - # One of the args in add_node.args is the output of all_reduce - # The same idea also applies to norm_weight - residual = add_node.args[0] if add_node.args[1] is allreduce_node else add_node.args[1] - norm_weight = ( - mul_node_2.args[0] if mul_node_2.args[1] is to_copy_2 else mul_node_2.args[1] - ) - eps = add_eps_node.args[1] - - # Insert nodes - with graph.inserting_before(allreduce_node): - fused_node = graph.call_function( - torch.ops.dist.fused_allreduce_residual_rmsnorm, - args=( - tensor, - residual, - norm_weight, - eps, - ), - ) - # Extract outputs from the tuple returned by `fused_node` - final_output_node = gm.graph.create_node( - "call_function", - target=operator.getitem, - args=(fused_node, 0), - ) - add_output_node = gm.graph.create_node( - "call_function", - target=operator.getitem, - args=(fused_node, 1), - ) - - # Replace all uses of rmsnorm_node with final_output_node - mul_node_2.replace_all_uses_with(final_output_node) - - # Replace all uses of add_node with add_output_node - add_node.replace_all_uses_with(add_output_node) - - nonlocal num_ar_r_rms_fusions - num_ar_r_rms_fusions += 1 - - # Traverse all nodes - for node in gm.graph.nodes: - if is_op(node, torch.ops.auto_deploy.torch_dist_all_reduce): - trace_and_fuse(allreduce_node=node, graph=gm.graph) - - canonicalize_graph(gm) - ad_logger.info(f"Found {num_ar_r_rms_fusions} allreduce+residual+rmsnorm fusions") - ad_logger.debug("After allreduce+residual+rmsnorm fusion: " + str(gm)) diff --git 
a/tensorrt_llm/_torch/auto_deploy/transformations/library/fused_moe.py b/tensorrt_llm/_torch/auto_deploy/transformations/library/fused_moe.py deleted file mode 100644 index e049970862..0000000000 --- a/tensorrt_llm/_torch/auto_deploy/transformations/library/fused_moe.py +++ /dev/null @@ -1,511 +0,0 @@ -from collections import defaultdict -from typing import Optional - -import torch -from torch.fx import GraphModule, Node - -from ...utils.cuda_mem_tracker import cuda_memory_tracker -from ...utils.logger import ad_logger -from ...utils.node_utils import bfs, identify_regions_between_residuals, is_linear_op, is_op -from ...utils.quantization_utils import get_scales_and_type_from_node -from .._graph import canonicalize_graph - - -def match_moe_pattern(gm: GraphModule) -> None: - graph = gm.graph - - ad_logger.debug("Before MoE Pattern Matching: " + str(gm)) - # Preprocessing: Identify boundary nodes (e.g. residual connections) in the graph. - boundary_nodes = identify_regions_between_residuals(gm) - - num_moe_patterns = 0 - - for start_boundary, end_boundary in zip(boundary_nodes[:-1], boundary_nodes[1:]): - # Step 1: Identify Expert Compute pattern - (pattern_input_nodes, pattern_output_nodes, expert_weights, expert_scales, weight_type) = ( - _match_expert_compute_pattern(start_boundary, end_boundary) - ) - if not expert_weights: - continue - # TODO: naming convention to verify the order of the weight nodes - - # Step 2: Trace upwards to locate normalize_routing_weight and selected_experts: - arg1_list, arg2_list = _extract_index_branches_from_expert_outputs(pattern_output_nodes) - normalized_routing_weights = _find_lowest_common_ancessor(arg1_list) - if not normalized_routing_weights: - continue - - common_ancessor2 = _find_lowest_common_ancessor(arg2_list) - if not common_ancessor2: - continue - selected_experts = bfs( - common_ancessor2, - lambda node: is_op(node, torch.ops.aten.one_hot), - attr_next="all_input_nodes", - boundary=start_boundary, - ).args[0] - if not selected_experts: - continue - - # Step 3: Trace upwards to find input node: - hidden_states = _find_lowest_common_ancessor(pattern_input_nodes) - if not hidden_states: - continue - - # Step 4: Find output node with the combine pattern - final_hidden_state_node = _find_final_hidden_state_node(pattern_output_nodes, end_boundary) - if final_hidden_state_node is None: - continue - - # Step 5: Insert the MoE op into the graph. 
- ad_logger.debug( - f"Found MoE Pattern: between boundary {start_boundary} and {end_boundary}.\n" - f"Input hidden states node: {hidden_states}, " - f"selected_experts node: {selected_experts}, " - f"routing_weights node: {normalized_routing_weights}, " - f"expert weights: {expert_weights}, weight type: {weight_type}" - ) - with graph.inserting_before(final_hidden_state_node): - w1_list = expert_weights["w1"] - w2_list = expert_weights["w2"] - w3_list = expert_weights["w3"] - - if weight_type == "fp8": - fused_moe_node = graph.call_function( - torch.ops.auto_deploy.torch_quant_fp8_moe, - args=( - hidden_states, - selected_experts, - normalized_routing_weights, - w1_list, - w2_list, - w3_list, - expert_scales["w1_input_scale"], - expert_scales["w2_input_scale"], - expert_scales["w3_input_scale"], - expert_scales["w1_weight_scale"], - expert_scales["w2_weight_scale"], - expert_scales["w3_weight_scale"], - ), - ) - elif weight_type == "fp4": - fused_moe_node = graph.call_function( - torch.ops.auto_deploy.torch_quant_fp4_moe, - args=( - hidden_states, - selected_experts, - normalized_routing_weights, - w1_list, - w2_list, - w3_list, - expert_scales["w1_input_scale"], - expert_scales["w2_input_scale"], - expert_scales["w3_input_scale"], - expert_scales["w1_weight_scale"], - expert_scales["w2_weight_scale"], - expert_scales["w3_weight_scale"], - expert_scales["w1_alpha"], - expert_scales["w2_alpha"], - expert_scales["w3_alpha"], - ), - ) - else: - fused_moe_node = graph.call_function( - torch.ops.auto_deploy.torch_moe, - args=( - hidden_states, - selected_experts, - normalized_routing_weights, - w1_list, - w2_list, - w3_list, - ), - ) - - final_hidden_state_node.replace_all_uses_with(fused_moe_node) - graph.erase_node(final_hidden_state_node) - - while _remove_dead_inplace_nodes_in_region(gm.graph, start_boundary, end_boundary): - gm.graph.eliminate_dead_code() - - num_moe_patterns += 1 - - canonicalize_graph(gm) - - ad_logger.info(f"Found {num_moe_patterns} MoE Patterns") - ad_logger.debug("After MoE Pattern Matching: " + str(gm)) - - -def fuse_moe(gm: torch.fx.GraphModule) -> None: - """ - Scan the FX graph and replace all calls to torch.ops.auto_deploy.torch_moe with - torch.ops.auto_deploy.trtllm_moe_fused. 
- """ - ad_logger.debug("Before MoE fusion: " + str(gm)) - - with cuda_memory_tracker(): - fused_key_counter = _insert_fused_moe_ops(gm) - if fused_key_counter: - canonicalize_graph(gm) - - ad_logger.info(f"Found {fused_key_counter} MoE fusions") - ad_logger.debug("After MoE fusion: " + str(gm)) - - -def _insert_fused_moe_ops(gm: GraphModule) -> int: - fused_key_counter = 0 - graph = gm.graph - - for node in list(graph.nodes): - if not is_op(node, torch.ops.auto_deploy.torch_moe): - continue - - ad_logger.debug(f"Found MoE op to fuse: {node} with args: {node.args}") - hidden_states, selected_experts, routing_weights, w1_list, w2_list, w3_list = node.args - - fused_w3_w1_experts = torch.stack( - [ - torch.cat( - [gm.get_parameter(w3_node.target), gm.get_parameter(w1_node.target)], dim=-2 - ) - for w1_node, w3_node in zip(w1_list, w3_list) - ], - dim=0, - ) - - fused_w2_experts = torch.stack([gm.get_parameter(n.target) for n in w2_list], dim=0) - - new_key_w3_w1 = f"fused_moe_w3_w1_stacked_{fused_key_counter}" - new_key_w2 = f"fused_moe_w2_stacked_{fused_key_counter}" - fused_key_counter += 1 - param_w3_w1 = torch.nn.Parameter(fused_w3_w1_experts) - param_w2 = torch.nn.Parameter(fused_w2_experts) - gm.register_parameter(new_key_w3_w1, param_w3_w1) - gm.register_parameter(new_key_w2, param_w2) - - with graph.inserting_before(node): - new_node = graph.call_function( - # TODO(Fridah-nv): torch.ops.auto_deploy.trtllm_moe_fused for quantized models - torch.ops.auto_deploy.trtllm_moe_fused, - args=( - hidden_states, - selected_experts, - routing_weights, - graph.get_attr(new_key_w3_w1), - graph.get_attr(new_key_w2), - ), - ) - - node.replace_all_uses_with(new_node) - graph.erase_node(node) - - return fused_key_counter - - -def _find_lowest_common_ancessor(nodes: list[Node]) -> Optional[Node]: - """ - Find the lowest common ancestor for a list of nodes in a torch.fx Graph by following - each node's primary branch (recursively following the first Node argument). - - It first finds the LCA of the first two nodes and then - iteratively computes the LCA of the result with the next node, and so on. - - Returns: - The common ancestor Node if found, otherwise None. - """ - if not nodes: - return None - - def get_parent(node: Node) -> Optional[Node]: - """Return the first Node-valued argument for a given node, or None if not found.""" - for arg in node.args: - if isinstance(arg, Node): - return arg - return None - - def get_depth(node: Node) -> int: - """ - Recursively compute the depth of the node by following its primary branch. - Depth is defined as the number of steps to reach a node with no parent. - """ - parent = get_parent(node) - if parent is None: - return 0 - return 1 + get_depth(parent) - - def lca_two(a: Node, b: Node) -> Optional[Node]: - """ - Find the lowest common ancestor of two nodes by first equalizing their depth - and then moving upward until a common node is found. - """ - depth_a = get_depth(a) - depth_b = get_depth(b) - - # Equalize depths - while depth_a > depth_b: - a = get_parent(a) - depth_a -= 1 - while depth_b > depth_a: - b = get_parent(b) - depth_b -= 1 - - # Walk upward in lockstep - while a is not None and b is not None: - if a is b: - return a - a = get_parent(a) - b = get_parent(b) - return None - - # Iteratively compute the LCA across all nodes. 
- common = nodes[0] - for node in nodes[1:]: - common = lca_two(common, node) - if common is None: - return None - - return common - - -def _extract_linear_parameters(linear_node: Node) -> tuple[Node, torch.Tensor, Optional[dict], str]: - """ - Given a linear op node, extract the input tensor node, weight tensor, - any quantization scales (if the op is quantized), and return a weight type. - - For a torch.ops.auto_deploy.torch_linear_simple.default op: - - Returns (input_node, weight, None, "simple") - - For a torch.ops.auto_deploy.torch_quant_fp8_linear op: - - Returns (input_node, weight, {"input_scale": input_scale, "weight_scale": weight_scale}, "fp8") - For a torch.ops.auto_deploy.torch_quant_fp4_linear op: - - Returns (input_node, weight, {"input_scale": input_scale, "weight_scale": weight_scale, "alpha": alpha}, "fp4") - """ - input_node = linear_node.args[0] - if is_op(linear_node, torch.ops.auto_deploy.torch_linear_simple): - weight = linear_node.args[1] - return input_node, weight, None, "" - elif { - is_op(linear_node, torch.ops.auto_deploy.torch_quant_fp4_linear), - is_op(linear_node, torch.ops.auto_deploy.torch_quant_fp8_linear), - }: - weight = linear_node.args[1] - scales, quant_type = get_scales_and_type_from_node(linear_node) - return input_node, weight, scales, quant_type - - -def _match_expert_compute_pattern(start_boundary: Node, end_boundary: Node): - """ - Match the expert compute pattern between the given boundaries. - - The expert compute pattern corresponds to: - - (F.silu(x @ w1.t()) * (x @ w3.t())) @ w2.t() - - For each expert, the function extracts the input node from the w1 branch and - collects the weight parameters from three linear ops (w1, w3, and w2 branches). - - This function supports both: - - torch.ops.auto_deploy.torch_linear_simple.default ops, and - - torch.ops.auto_deploy.torch_quant_fp8_linear ops (also extracts quantization scales). - - torch.ops.auto_deploy.torch_quant_fp4_linear ops (also extracts quantization scales). - - Returns: - A tuple: - (pattern_input_nodes, pattern_output_nodes, expert_weights, expert_scales, weight_type) - - - pattern_input_nodes: List of input nodes (x) used for the expert compute. - - pattern_output_nodes: List of final expert output nodes (the linear op with weight w2). - - expert_weights: Dict with keys "w1", "w2", "w3" mapping to lists of weight tensors. - - expert_scales: Dict with keys "w1_input_scale", "w1_weight_scale", etc., containing scale tensors - (empty if weight_type is "simple"). - - weight_type: "fp8" if FP8 ops were used, "simple" otherwise. - """ - pattern_input_nodes, pattern_output_nodes = [], [] - expert_weights = defaultdict(list) - expert_scales = defaultdict(list) - weight_type = "simple" # default - - nodes = list(start_boundary.graph.nodes) - region_nodes = nodes[nodes.index(start_boundary) + 1 : nodes.index(end_boundary)] - - for node in region_nodes: - # Accept both simple and quantized linear ops. 
- if not is_linear_op(node, include_quantization=True): - continue - - final_linear = node - if not final_linear.args or not isinstance(final_linear.args[0], Node): - continue - - mul_node = final_linear.args[0] - if not is_op(mul_node, torch.ops.aten.mul) or len(mul_node.args) < 2: - continue - - arg_a, arg_b = mul_node.args[:2] - silu_node = ( - arg_a - if is_op(arg_a, torch.ops.aten.silu) - else arg_b - if is_op(arg_b, torch.ops.aten.silu) - else None - ) - if silu_node is None: - continue - - if not (silu_node.args and is_linear_op(silu_node.args[0], include_quantization=True)): - continue - linear_w1_node = silu_node.args[0] - - # The other branch should be a linear op (w3 branch). - linear_w3_node = arg_b if arg_a is silu_node else arg_a - if not is_linear_op(linear_w3_node, include_quantization=True): - continue - if not (linear_w1_node.args and linear_w3_node.args): - continue - - # Extract parameters from each linear op. - input_node_w1, weight_w1, quant_params_w1, wt_type_w1 = _extract_linear_parameters( - linear_w1_node - ) - _, weight_w3, quant_params_w3, wt_type_w3 = _extract_linear_parameters(linear_w3_node) - _, weight_w2, quant_params_w2, wt_type_w2 = _extract_linear_parameters(final_linear) - - if None in (weight_w1, weight_w3, weight_w2): - continue - - # Ensure the weight type is consistent across branches. - if wt_type_w1 != wt_type_w3 or wt_type_w1 != wt_type_w2: - continue - weight_type = wt_type_w1 - - pattern_input_nodes.append(input_node_w1) - pattern_output_nodes.append(final_linear) - expert_weights["w1"].append(weight_w1) - expert_weights["w3"].append(weight_w3) - expert_weights["w2"].append(weight_w2) - - # TODO: sanity check that all experts have same weight type - if weight_type == "fp8": - expert_scales["w1_input_scale"].append(quant_params_w1["input_scale"]) - expert_scales["w1_weight_scale"].append(quant_params_w1["weight_scale"]) - expert_scales["w3_input_scale"].append(quant_params_w3["input_scale"]) - expert_scales["w3_weight_scale"].append(quant_params_w3["weight_scale"]) - expert_scales["w2_input_scale"].append(quant_params_w2["input_scale"]) - expert_scales["w2_weight_scale"].append(quant_params_w2["weight_scale"]) - elif weight_type == "fp4": - expert_scales["w1_input_scale"].append(quant_params_w1["input_scale"]) - expert_scales["w1_weight_scale"].append(quant_params_w1["weight_scale"]) - expert_scales["w1_alpha"].append(quant_params_w1["alpha"]) - expert_scales["w3_input_scale"].append(quant_params_w3["input_scale"]) - expert_scales["w3_weight_scale"].append(quant_params_w3["weight_scale"]) - expert_scales["w3_alpha"].append(quant_params_w3["alpha"]) - expert_scales["w2_input_scale"].append(quant_params_w2["input_scale"]) - expert_scales["w2_weight_scale"].append(quant_params_w2["weight_scale"]) - expert_scales["w2_alpha"].append(quant_params_w2["alpha"]) - - return pattern_input_nodes, pattern_output_nodes, expert_weights, expert_scales, weight_type - - -def _find_final_hidden_state_node( - pattern_output_nodes: list[Node], end_boundary: Node -) -> Optional[Node]: - """ - Identify the final hidden state node corresponding to the combine pattern: - - (expert_output * routing_weight) → index_add_ - - For each expert output node (from the expert compute pattern), this function: - 1. Retrieves a multiplication node from its users. - 2. Extracts the second argument from the multiplication node (assumed to be the index node). - 3. Uses a BFS to locate the subsequent index_add_ node (guarded by the end_boundary). 
- - After collecting all such index_add_ nodes, the final hidden state node is determined - as the one that is not used by any of the other index_add_ nodes. - - If any required attribute (users or args) is missing during the process or if no valid - final node is found, the function returns None. - """ - - if not pattern_output_nodes: - return None - - index_add_nodes = [] - for node in pattern_output_nodes: - if not node.users: - return None - mul_node = next(iter(node.users)) - if not (hasattr(mul_node, "args") and len(mul_node.args) >= 2): - return None - index_node = mul_node.args[1] - index_add_node = bfs( - index_node, lambda n: is_op(n, torch.ops.aten.index_add_), boundary=end_boundary - ) - if not index_add_node: - return None - index_add_nodes.append(index_add_node) - - # The final node is defined as the index_add_node that is not used by any other index_add_nodes - return next( - ( - candidate - for candidate in index_add_nodes - if not any( - candidate in other.args for other in index_add_nodes if candidate is not other - ) - ), - None, - ) - - -def _extract_index_branches_from_expert_outputs( - pattern_output_nodes: list[Node], -) -> tuple[list[Node], list[Node]]: - """ - Extract routing and experts branches from expert outputs. - - For each expert output, find its multiplication user. From the - multiplication node's second argument (an index node), - extract: - - The first argument as the routing branch. - - The second argument (flattened if a list/tuple) as the experts branch. - - Returns: - A tuple (routing_branches, experts_branches). - """ - routing_branches, experts_branches = [], [] - for out in pattern_output_nodes: - mul = next((u for u in out.users if is_op(u, torch.ops.aten.mul)), None) - if not mul or len(mul.args) < 2: - continue - idx_node = mul.args[1] - if not is_op(idx_node, torch.ops.aten.index): - continue - routing_branches.append(idx_node.args[0]) - experts = idx_node.args[1] - experts_branches.extend(experts) if isinstance( - experts, (list, tuple) - ) else experts_branches.append(experts) - return routing_branches, experts_branches - - -def _remove_dead_inplace_nodes_in_region( - graph: torch.fx.Graph, - start_boundary: torch.fx.Node, - end_boundary: torch.fx.Node, -) -> bool: - """ - Searches (via BFS) for a dead in-place node (index_add_) in the region - between start_boundary and end_boundary. If one is found, it is removed from the graph. - Returns True if a node was removed, False otherwise. 
- """ - - def target(n: torch.fx.Node) -> bool: - return is_op(n, {torch.ops.aten.index_add_}) and len(n.users) == 0 - - try: - node_to_remove = bfs(start_boundary, target, attr_next="users", boundary=end_boundary) - ad_logger.debug(f"Removing In-place Dead Node: {node_to_remove}") - graph.erase_node(node_to_remove) - return True - except RuntimeError: - return False diff --git a/tensorrt_llm/_torch/auto_deploy/transformations/library/kvcache.py b/tensorrt_llm/_torch/auto_deploy/transformations/library/kvcache.py deleted file mode 100644 index 618c8108f8..0000000000 --- a/tensorrt_llm/_torch/auto_deploy/transformations/library/kvcache.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Graph transformation to automatically add kv cache into fused MHA op.""" - -import operator -from typing import Dict, Type - -import torch -from torch.fx import Graph, GraphModule, Node - -from ...custom_ops.attention_interface import AttentionDescriptor, CacheConfig -from ...distributed.common import all_gather_object, get_world_size -from ...shim.interface import CachedSequenceInterface -from ...utils.logger import ad_logger -from ...utils.node_utils import get_all_input_output_nodes, is_op -from .._graph import add_graph_input, canonicalize_graph - - -def update_in_out_nodes(egm: GraphModule, cm: CachedSequenceInterface) -> None: - """Modify the graph module by adding new input nodes and canonicalizing the graph. - - The new input nodes correspond to the extra arguments needed for cached and flattened attention. - - Args: - egm: The graph module to analyze and modify. - cm: Cached sequence interface containing extra argument information. - """ - # loop through nodes to get input, output, and get_attr nodes - input_nodes, output_nodes = get_all_input_output_nodes(egm.graph) - - # we only expect one input node - assert len(input_nodes) == 2, "Expected exactly two input nodes (input_ids, position_ids)." - - # NOTE: for now, we wanna make sure we *only* return the final output and no hidden states. - # Later on, we can revisit how to support returning hidden states. - assert len(output_nodes) == 1, "Expected exactly one output node!" - assert len(output_nodes[0].all_input_nodes) == 1, "Expected to only return final tensor output!" - - ad_logger.info(f"Found {len(input_nodes)} input nodes and {len(output_nodes)} output nodes") - - # Activate and add extra argument nodes - new_args = cm.info.switch_to_cached_attn_inputs() - for name in new_args: - input_nodes.append(add_graph_input(egm, name)) - ad_logger.info(f"Added {len(new_args)} new input nodes for cached attention metadata") - - canonicalize_graph(egm) - - -def insert_cached_attention( - egm: GraphModule, - cm: CachedSequenceInterface, - attn_descriptor: Type[AttentionDescriptor], - cache_config: CacheConfig, -) -> None: - """Replace uncached source attention node with corresponding cached attn node.""" - # Get all attention nodes and their info objects - source_op = attn_descriptor.get_source_attention_op() - - # pick up graph - graph: Graph = egm.graph - - # look for relevant source attention nodes - source_attn_nodes = [n for n in graph.nodes if is_op(n, source_op)] - - if not source_attn_nodes: - # If there are no nodes for kv cache insertion found, return current graph - return - - # Sanity check - if cm.info.is_paged: - assert attn_descriptor.is_paged(), "Paged sequence info requires paged attention op." 
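
As a hedged aside (not part of the original file), the net effect of this pass on each matched attention node can be summarized as follows, using only names that appear in the code below:

    # Each source attention node is replaced by a call of the form
    #   attn_descriptor.get_cached_attention_op()(
    #       *qkv,              # original Q/K/V inputs of the source op
    #       *metadata_nodes,   # outputs of the prepare-metadata op inserted after the graph inputs
    #       *cache_in_nodes,   # per-op KV-cache tensors registered via cm.add_cache and added as graph inputs
    #       *buffer_in_nodes,  # global buffers shared (deduplicated) across attention ops
    #       *constants,        # per-op constants from attn_descriptor.get_constants
    #   )
    # This mirrors the argument layout used by the new InsertCachedAttention transform added earlier in this patch.
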
- - ad_logger.debug(f"Before inserting {attn_descriptor=} with cache: {egm}") - - # retrieve input nodes - input_nodes, _ = get_all_input_output_nodes(egm.graph) - - # insert metadata computation and extract each argument as a node - get_metadata, num_metadata = attn_descriptor.get_prepare_metadata_op() - with graph.inserting_before(input_nodes[-1].next): - ret_node = graph.call_function( - get_metadata, - args=( - *input_nodes, - cm.info.page_size, - ), - ) - metadata_nodes = [ - graph.call_function(operator.getitem, args=(ret_node, idx)) - for idx in range(num_metadata) - ] - - buffer_in_lookup: Dict[str, Node] = {} - - # replace fused attention node with attention node that has kv cache - num_cached_attn_replacements = 0 - for idx, attn_node in enumerate(source_attn_nodes): - # pick out GEMMs - qkv = attn_node.args[: attn_descriptor.get_num_qkv_args()] - - # setup + store cache initializers and caches as input nodes - cache_in_nodes = [] - for k, get_cache in attn_descriptor.get_cache_initializers(attn_node, cache_config).items(): - k_indexed = f"{k}_{idx}" - cm.add_cache(k_indexed, get_cache) - cache_in_nodes.append(add_graph_input(egm, k_indexed)) - - # setup + store global buffer initializers and buffers as input nodes - # NOTE: we have to check against existing keys to make sure nothing is registered twice... - buffer_in_nodes = [] - for k, get_buffer in attn_descriptor.get_global_buffer_initializers(attn_node).items(): - if k not in buffer_in_lookup: - cm.add_cache(k, get_buffer) - buffer_in_lookup[k] = add_graph_input(egm, k) - buffer_in_nodes.append(buffer_in_lookup[k]) # store buffer nodes for this op - - # retrieve constants for attention_op - constants = attn_descriptor.get_constants(attn_node) - - # insert cached attention replacement op - with graph.inserting_before(attn_node): - cached_attn_node = graph.call_function( - attn_descriptor.get_cached_attention_op(), - args=(*qkv, *metadata_nodes, *cache_in_nodes, *buffer_in_nodes, *constants), - ) - attn_node.replace_all_uses_with(cached_attn_node) - graph.erase_node(attn_node) - num_cached_attn_replacements += 1 - - canonicalize_graph(egm) - ad_logger.info( - f"Replaced {num_cached_attn_replacements} {source_op} ops " - f"with {attn_descriptor.get_cached_attention_op()}" - ) - ad_logger.debug(f"After inserting {attn_descriptor=} with cache: {egm}") - - -def resize_kv_cache( - egm: GraphModule, - cm: CachedSequenceInterface, - free_mem_ratio: float = 0.8, -) -> None: - """Inflate the kv cache to occupy the available GPU memory. - - free_mem_ratio specifies the fraction of available memory to occupy. 
- """ - - def _get_mem_info_in_mb(): - free_mem, total_mem = torch.cuda.mem_get_info() - return free_mem // 1024**2, total_mem // 1024**2 - - free_mem, total_mem = _get_mem_info_in_mb() - ad_logger.info(f"Free memory (MB): {free_mem}, Total memory (MB): {total_mem}") - current_cache_size = cm.current_cache_size_bytes() - current_num_pages = cm.info.num_pages - ad_logger.info( - f"Current cache size: {current_cache_size}, Current num pages: {current_num_pages}" - ) - - if free_mem_ratio == 0.0: - ad_logger.info(f"Skipping cache resize for {free_mem_ratio=}") - return - - try: - # Let's run a forward pass to get the memory usage - cm.info._set_max_num_tokens_sample() - free_mem_pre, _ = _get_mem_info_in_mb() - ad_logger.info(f"Free memory before forward pass (MB): {free_mem_pre}") - - egm(*cm.args) - - free_mem_post, _ = _get_mem_info_in_mb() - ad_logger.info(f"Free memory after forward pass (MB): {free_mem_post}") - - memory_for_forward_pass = free_mem_pre - free_mem_post - ad_logger.info(f"Memory for forward pass (MB): {memory_for_forward_pass}") - - new_cache_size = free_mem_post * 1024 * 1024 * free_mem_ratio + current_cache_size - new_num_pages = int(new_cache_size // (current_cache_size // current_num_pages)) - - # Need to sync all the GPUs - gathered_num_pages = [None] * get_world_size() - all_gather_object(gathered_num_pages, new_num_pages) - new_num_pages = min(gathered_num_pages) - ad_logger.info(f"After all_gather - new_num_pages: {new_num_pages}") - - cm.resize_cache(new_num_pages) - except Exception as e: - ad_logger.warning( - f"Error encountered while resizing kv cache: {e}.\nSkipping cache resize." - ) - - # Free memory - torch.cuda.empty_cache() diff --git a/tensorrt_llm/_torch/auto_deploy/transformations/library/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/transformations/library/rms_norm.py deleted file mode 100644 index a94758b181..0000000000 --- a/tensorrt_llm/_torch/auto_deploy/transformations/library/rms_norm.py +++ /dev/null @@ -1,113 +0,0 @@ -"""Graph transform to optimize RMSNorm execution using FlashInfer.""" - -from functools import partial - -import torch -from torch.fx import GraphModule - -from ...utils.logger import ad_logger - -# It is important to import ADPatternMatcherPass from pattern_matcher.py, not from torch._inductor.pattern_matcher -from ...utils.pattern_matcher import ADPatternMatcherPass, register_ad_pattern -from .._graph import canonicalize_graph - -_BACKEND_OPS = { - "flashinfer": torch.ops.auto_deploy.flashinfer_rms_norm, - "triton": torch.ops.auto_deploy.triton_rms_norm, - "torch": torch.ops.auto_deploy.torch_rmsnorm, -} - - -def _rms_norm_pattern(data: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor: - """Implements the RMSNorm pattern for pattern matching. - - Args: - data: Input tensor to normalize. - weight: Scaling weights for the normalized output. - eps: Small constant for numerical stability. - - Returns: - Normalized and scaled tensor. - """ - input_dtype = data.dtype - data = data.to(torch.float32) - variance = data.pow(2).mean(-1, keepdim=True) - data = data * torch.rsqrt(variance + eps) - return weight * data.to(input_dtype) - - -def _rms_norm_replacement( - data: torch.Tensor, weight: torch.Tensor, eps: float, backend: str -) -> torch.Tensor: - """Backend-specific rms_norm implementation. - - Args: - data: Input tensor to normalize. - weight: Scaling weights for the normalized output. - eps: Small constant for numerical stability. - backend: Backend to use for RMSNorm computation ("flashinfer" or "triton"). 
- - Returns: - Normalized and scaled tensor using the specified backend implementation. - """ - - assert backend.lower() in _BACKEND_OPS, ( - f"Invalid {backend=}; must be one of {list(_BACKEND_OPS)}" - ) - return _BACKEND_OPS[backend.lower()](data, weight, eps) - - -def fuse_rmsnorm(gm: GraphModule, backend: str = "triton") -> None: - """Matches and replaces RMSNorm patterns in the graph with FlashInfer or Triton implementation. - - This function sets up pattern matching to identify RMSNorm operations in the graph - and replaces them with optimized implementations. It uses dummy tensors to register - the pattern matching rules. - - Args: - gm: Input graph module to transform. - backend: Backend to use for RMSNorm computation ("flashinfer" or "triton"). - - Returns: - Transformed graph module with optimized RMSNorm operations. - """ - if backend.lower() not in _BACKEND_OPS: - raise ValueError(f"Invalid backend, must be one of {list(_BACKEND_OPS)}, got {backend}") - ad_logger.info(f"Starting RMSNorm pattern matching with backend: {backend}") - - graph = gm.graph - patterns = ADPatternMatcherPass() - - # Create dummy tensors for pattern matching - bs = 2 - hidden_size = 512 - - def dummy_args(input_dtype: torch.dtype, weight_dtype: torch.dtype, eps: float = 1e-6): - return [ - torch.randn(bs, hidden_size, device="cuda", dtype=input_dtype), - torch.randn(hidden_size, device="cuda", dtype=weight_dtype), - eps, - ] - - # Define configurations for different data types - configs = [ - (torch.bfloat16, torch.bfloat16), - (torch.float16, torch.float16), - (torch.float32, torch.float32), - ] - - # Register patterns for each configuration - for input_dtype, weight_dtype in configs: - register_ad_pattern( - search_fn=_rms_norm_pattern, - replace_fn=partial(_rms_norm_replacement, backend=backend), - patterns=patterns, - dummy_args=dummy_args(input_dtype, weight_dtype), - op_ignore_types={}, - scalar_workaround={"eps": 1e-6}, - ) - - cnt = patterns.apply(graph) - ad_logger.info(f"RMSNorm pattern count: {cnt}") - canonicalize_graph(gm) - ad_logger.debug("RMSNorm pattern matching completed.") diff --git a/tensorrt_llm/_torch/auto_deploy/transformations/transform.py b/tensorrt_llm/_torch/auto_deploy/transformations/transform.py index c841b4601f..931c8ec955 100644 --- a/tensorrt_llm/_torch/auto_deploy/transformations/transform.py +++ b/tensorrt_llm/_torch/auto_deploy/transformations/transform.py @@ -5,21 +5,11 @@ import gc import torch import torch.nn as nn -from ..compile import compile_and_capture from ..custom_ops.attention_interface import AttentionRegistry from ..llm_args import AutoDeployConfig from ..models.factory import ModelFactory from ..shim.interface import CachedSequenceInterface from ..transform.optimizer import InferenceOptimizer as ModularInferenceOptimizer -from ..utils.logger import ad_logger -from .library import ( - fuse_allreduce_residual_rmsnorm, - fuse_collectives, - fuse_rmsnorm, - insert_cached_attention, - resize_kv_cache, - update_in_out_nodes, -) class InferenceOptimizer: @@ -55,88 +45,60 @@ class InferenceOptimizer: self.ad_config.attn_backend ).get_attention_layout() - new_optimizer = ModularInferenceOptimizer(self.factory, self.ad_config.transforms) - - # TODO (hg): similar to above. 
- if "load_weights" in new_optimizer.config: - new_optimizer.config[ + if "load_weights" in self.ad_config.transforms: + self.ad_config.transforms[ "load_weights" ].checkpoint_device = self.ad_config.checkpoint_device - new_optimizer.config["load_weights"].device = cm.device + self.ad_config.transforms["load_weights"].device = cm.device + + if "resize_kv_cache" in self.ad_config.transforms: + self.ad_config.transforms[ + "resize_kv_cache" + ].free_mem_ratio = self.ad_config.free_mem_ratio + if "insert_cached_attention" in self.ad_config.transforms: + self.ad_config.transforms[ + "insert_cached_attention" + ].attn_backend = self.ad_config.attn_backend + if "insert_cached_mla_attention" in self.ad_config.transforms: + self.ad_config.transforms[ + "insert_cached_mla_attention" + ].attn_backend = self.ad_config.mla_backend + + # TODO: (hg)Missing MLA here. Figure out how to add MLA since duplicate transforms are not allowed. + # Old code: + # detect attention op and replace with cache-aware op + # for a_backend in [self.ad_config.attn_backend, self.ad_config.mla_backend]: + # attn_descriptor = AttentionRegistry.get(a_backend) + # insert_cached_attention(egm, cm, attn_descriptor, self.factory.get_cache_config()) + + if "compile_model" in self.ad_config.transforms: + self.ad_config.transforms[ + "compile_model" + ].cuda_graph_batch_sizes = self.ad_config.cuda_graph_batch_sizes + self.ad_config.transforms[ + "compile_model" + ].compile_backend = self.ad_config.compile_backend + + new_optimizer = ModularInferenceOptimizer(self.factory, self.ad_config.transforms) + # TODO: (hg) move this. let match_rope_layout and match_atten_layout use this shared config + new_optimizer.shared_config.attn_backend = self.ad_config.attn_backend egm = new_optimizer(cm) - # TODO (lucaslie): continue moving legacy transforms to the new optimizer - ############################################################################################ - # RUN POST-LOAD FUSION AND OPTIMIZATIONS - ############################################################################################ + # NOTE: (hg)Disabled visualization since compiled gm is a CapturedGraph instead of GraphModule. + # We can add a new stage in the optimizer to visualize the intermediate gm. + # if self.ad_config.visualize: + # try: + # from .library import visualize_namespace - # run MoE fusion - # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/4674 this is causing OOMs - # fuse_moe(egm) - - # run GEMM fusion - # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/4674 this is causing OOMs - # fuse_gemms(egm) - - # check if we can fuse allreduce, residual and rmsnorm - fuse_allreduce_residual_rmsnorm(egm) - - # check if we can fuse collectives - fuse_collectives(egm) - - # TODO (lucaslie): add backend selection as part of configurable inference optimizers - # check if we can fuse rmsnorm - fuse_rmsnorm(egm, "flashinfer") - - # visualize the final graph - if self.ad_config.visualize: - try: - from .library import visualize_namespace - - visualize_namespace(egm, args=cm.args, dynamic_shapes=cm.dynamic_shapes) - ad_logger.warning( - "Please run `pip install -r examples/auto_deploy/requirements.txt` to visualize" - " the graph." 
- ) - except ImportError: - pass - - ############################################################################################ - # SWITCH TO CACHED+FLATTENED ATTENTION + INITIALIZE CACHES - ############################################################################################ - - update_in_out_nodes(egm, cm) - - # detect attention op and replace with cache-aware op - for a_backend in [self.ad_config.attn_backend, self.ad_config.mla_backend]: - attn_descriptor = AttentionRegistry.get(a_backend) - insert_cached_attention(egm, cm, attn_descriptor, self.factory.get_cache_config()) - - # initialize cache on correct device - cm.initialize_caches() - - # resize kv cache to occupy the available GPU memory up to free_mem_ratio - resize_kv_cache(egm, cm, free_mem_ratio=self.ad_config.free_mem_ratio) - - ############################################################################################ - # COMPILE MODEL - ############################################################################################ - - cm.info.set_generate_only_batch() - compiler_kwargs = { - "cuda_graph_batch_sizes": self.ad_config.cuda_graph_batch_sizes, - "num_batched_inputs": 2, # TODO (lucaslie): improve once we have a config system... - } - egm_compiled = compile_and_capture( - egm, - self.ad_config.compile_backend, - args=cm.args, - dynamic_shapes=cm.dynamic_shapes, - compiler_kwargs=compiler_kwargs, - ) - cm.info.reset() + # visualize_namespace(egm, args=cm.args, dynamic_shapes=cm.dynamic_shapes) + # ad_logger.warning( + # "Please run `pip install -r examples/auto_deploy/requirements.txt` to visualize" + # " the graph." + # ) + # except ImportError: + # pass torch.cuda.empty_cache() gc.collect() - return egm_compiled + return egm diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py index c81ca0ae1c..58d22302f2 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py @@ -8,9 +8,7 @@ from torch.export import export from tensorrt_llm._torch.auto_deploy.distributed import common as dist from tensorrt_llm._torch.auto_deploy.distributed.trtllm import is_trtllm_op_available from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm -from tensorrt_llm._torch.auto_deploy.transformations.library.collectives import ( - fuse_allreduce_residual_rmsnorm, -) +from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op from tensorrt_llm.llmapi.mpi_session import MpiPoolSession @@ -65,14 +63,21 @@ def _test_allreduce_fusion(port: int): original_outputs, residual_original = gm(x, residual) # Fuse ops - fuse_allreduce_residual_rmsnorm(gm) + gm_transformed = InferenceOptimizer( + None, + { + "fuse_allreduce_residual_rmsnorm": { + "stage": "post_load_fusion", + }, + }, + )(None, gm) # Run the fused graph - fused_outputs, residual_fused = gm(x, residual) + fused_outputs, residual_fused = gm_transformed(x, residual) # Check if fused node in the graph has_fused_node = False - for node in gm.graph.nodes: + for node in gm_transformed.graph.nodes: if is_op(node, torch.ops.dist.fused_allreduce_residual_rmsnorm): has_fused_node = True assert has_fused_node, "Fused 
node not found." @@ -86,8 +91,8 @@ def _test_allreduce_fusion(port: int): ) # check if we can still export the model as expected - export(gm, args=args) - torch_export_to_gm(gm, args=args) + export(gm_transformed, args=args) + torch_export_to_gm(gm_transformed, args=args) @pytest.mark.parametrize("device_count", get_device_counts()) diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_collective_fusion.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_collective_fusion.py index 4aa1a875c4..ed3b98f281 100644 --- a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_collective_fusion.py +++ b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_collective_fusion.py @@ -8,12 +8,13 @@ import torch import torch.nn as nn import torch.nn.functional as F from _dist_test_utils import get_device_counts -from _graph_test_helpers import run_test +from _graph_test_helpers import run_test_transformed_gm from _torch_test_utils import fp8_compatible import tensorrt_llm._torch.auto_deploy.distributed.common as dist_common from tensorrt_llm._torch.auto_deploy.custom_ops.quant import FP8Linear -from tensorrt_llm._torch.auto_deploy.transformations.library import fuse_collectives +from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm +from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op @@ -61,11 +62,21 @@ def _run_job( is_op(n, torch.ops.auto_deploy.torch_dist_all_reduce) for n in gm.graph.nodes ) + gm = torch_export_to_gm(model, args=(x,), clone=True) + gm_transformed = InferenceOptimizer( + None, + { + "fuse_collectives": { + "stage": "post_load_fusion", + }, + }, + )(None, gm) + # now run the test - run_test( + run_test_transformed_gm( model, x, - transform=fuse_collectives, + gm_transformed, check_transformed_graph=check_transformed_graph, _get_expected_num_params=_get_expected_num_params, test_load_hook=False, diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py index be2f9d52af..691aad78c5 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_fuse_rmsnorm.py @@ -1,12 +1,11 @@ -from functools import partial - import pytest import torch -from _graph_test_helpers import run_test +from _graph_test_helpers import run_test_transformed_gm from torch.export import Dim from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import * # noqa -from tensorrt_llm._torch.auto_deploy.transformations.library.rms_norm import fuse_rmsnorm +from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm +from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op @@ -52,15 +51,28 @@ def test_rmsnorm_fusion(eps, variant, op): return any(is_op(n, op) for n in gm.graph.nodes) model = TestModel(eps) - gm_transformed = run_test( + x = torch.randn(2, 1024, device="cuda", dtype=torch.float16) + dynamic_shapes = {0: Dim("batch_size", max=8)} + gm = torch_export_to_gm(model, args=(x,), dynamic_shapes=(dynamic_shapes,), clone=True) + gm_transformed = InferenceOptimizer( + None, + { + "fuse_rmsnorm": { + "stage": 
"post_load_fusion", + "backend": variant, + }, + }, + )(None, gm) + + run_test_transformed_gm( model, - torch.randn(2, 1024, device="cuda", dtype=torch.float16), - partial(fuse_rmsnorm, backend=variant), + x, + gm_transformed, checker, lambda num_p_og: num_p_og, - dynamic_shapes={0: Dim("batch_size", max=8)}, + dynamic_shapes=dynamic_shapes, ) - print(gm_transformed.graph) + new_input = torch.randn(4, 1024, device="cuda", dtype=torch.float16) y_transformed = gm_transformed(new_input) y_model = model(new_input) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_gemm_fusion.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_gemm_fusion.py index 82a5104503..b99862fdc1 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_gemm_fusion.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_gemm_fusion.py @@ -7,11 +7,12 @@ import pytest import torch import torch.nn as nn import torch.nn.functional as F -from _graph_test_helpers import count_buffers, run_test +from _graph_test_helpers import count_buffers, run_test_transformed_gm from _torch_test_utils import all_close, fp8_compatible, reset_parameters from tensorrt_llm._torch.auto_deploy.custom_ops.quant import FP8Linear -from tensorrt_llm._torch.auto_deploy.transformations.library import fuse_gemms +from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm +from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_linear_op torch.manual_seed(0) @@ -254,10 +255,20 @@ def test_fusion(get_model: Callable[[], TestModel], dtype: str): buffer_size_before = count_buffers(model) - gm_transformed = run_test( + gm = torch_export_to_gm(model, args=(x,), clone=True) + gm_transformed = InferenceOptimizer( + None, + { + "fuse_gemms": { + "stage": "post_load_fusion", + }, + }, + )(None, gm) + + run_test_transformed_gm( model, x, - fuse_gemms, + gm_transformed, lambda gm: sum(is_linear_op(n, include_quantization=True) for n in gm.graph.nodes) == model.num_gemms_after_fusion, lambda num_p_og: num_p_og, # unchanged since fusing doesn't change param count diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_kv_cache.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_kv_cache.py index f2fd32ea3e..9266027e11 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_kv_cache.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/transformations/library/test_kv_cache.py @@ -2,19 +2,35 @@ from typing import Optional import pytest import torch -from _graph_test_helpers import FakeFactory +from _graph_test_helpers import SequenceEmbeddingInfo from _model_test_utils import GQA from _torch_test_utils import all_close -from tensorrt_llm._torch.auto_deploy.custom_ops.attention_interface import CacheConfig, SequenceInfo -from tensorrt_llm._torch.auto_deploy.custom_ops.flashinfer_attention import FlashInferAttention -from tensorrt_llm._torch.auto_deploy.custom_ops.triton_attention import TritonAttention +from tensorrt_llm._torch.auto_deploy.custom_ops.attention_interface import CacheConfig from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm +from tensorrt_llm._torch.auto_deploy.models.factory import ModelFactory from tensorrt_llm._torch.auto_deploy.shim.interface import CachedSequenceInterface -from 
tensorrt_llm._torch.auto_deploy.transform.interface import InferenceOptimizerConfig from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer -from tensorrt_llm._torch.auto_deploy.transformations.library import update_in_out_nodes -from tensorrt_llm._torch.auto_deploy.transformations.library.kvcache import insert_cached_attention + + +class DummyFactory(ModelFactory): + """Dummy factory to pass cache_config for testing.""" + + def __init__(self, model, cache_config): + self._model = model + self.cache_config = cache_config + + def build_model(self, device: str): + return self._model.to(device=device) + + def _build_model(self, device: str): + return + + def _load_checkpoint(self, model, device): + return + + def get_cache_config(self): + return self.cache_config # Class that uses SDPA directly instead of the regular attention mechanism @@ -68,42 +84,6 @@ class GQAWithSdpa(GQA): return self.o_proj(attn_output) -def _get_optimizer_config() -> InferenceOptimizerConfig: - return { - "build_model": { - "stage": "factory", - "device": "cuda", - "run_graph_cleanup": False, - "requires_clean_graph": False, - }, - "export_to_gm": { - "stage": "export", - "strict": False, - "clone_state_dict": True, - "run_graph_cleanup": False, - "requires_clean_graph": False, - }, - "cleanup_input_constraints": { - "stage": "post_export", - }, - } - - -class SequenceEmbeddingInfo(SequenceInfo): - hidden_size: int - dtype: torch.dtype - - def set_example_sequence(self) -> None: - super().set_example_sequence() - # set input ids to a 3D tensor (actually input embeddings) - self.input_ids = torch.rand( - *self.input_ids.shape, - self.hidden_size, - device=self.input_ids.device, - dtype=self.dtype, - ) - - # TODO (lucaslie): consider rewriting this test with a custom InferenceOptimizer config @pytest.mark.parametrize( "dtype", @@ -111,8 +91,8 @@ class SequenceEmbeddingInfo(SequenceInfo): ids=["float16", "float32"], ) @pytest.mark.parametrize( - "attn_descriptor", - [TritonAttention, FlashInferAttention], + "attn_backend", + ["triton", "flashinfer"], ids=["triton", "flashinfer"], ) @pytest.mark.parametrize( @@ -125,10 +105,10 @@ class SequenceEmbeddingInfo(SequenceInfo): ids=["regular", "gqa", "mqa"], ) @torch.inference_mode() -def test_sdpa_with_kv_cache(dtype, attn_descriptor, gqa_config): +def test_sdpa_with_kv_cache(dtype, attn_backend, gqa_config): """Test the SDPA transformation with KV cache.""" # flashinfer doesn't support float32 data type - if attn_descriptor == FlashInferAttention and dtype == torch.float32: + if attn_backend == "flashinfer" and dtype == torch.float32: pytest.skip("flashinfer doesn't support float32 data type") # Unpack the GQA configuration @@ -157,7 +137,6 @@ def test_sdpa_with_kv_cache(dtype, attn_descriptor, gqa_config): hidden_size, num_key_value_heads, ).to(dtype=dtype, device="cuda") - factory = FakeFactory(model) # Create input tensor and position_ids x = torch.rand(batch_size, seq_len, hidden_size).to(device="cuda", dtype=dtype) @@ -166,21 +145,37 @@ def test_sdpa_with_kv_cache(dtype, attn_descriptor, gqa_config): # Get the model's regular output y_model = model(x, position_ids) # b, s, d - # run modular inference optimizer up to post_export - optimizer = InferenceOptimizer(factory, _get_optimizer_config()) # type: ignore + # Apply the transformation + optimizer = InferenceOptimizer( + DummyFactory(model, CacheConfig()), + { + "build_model": { + "stage": "factory", + "device": "cuda", + "run_graph_cleanup": False, + "requires_clean_graph": False, + }, + 
"export_to_gm": { + "stage": "export", + "strict": False, + "clone_state_dict": True, + "run_graph_cleanup": False, + "requires_clean_graph": False, + }, + "cleanup_input_constraints": { + "stage": "post_export", + }, + "update_in_out_nodes": { + "stage": "cache_init", + }, + "insert_cached_attention": { + "stage": "cache_init", + "attn_backend": attn_backend, + }, + }, + ) # type: ignore gm = optimizer(cm) - y_gm = gm(x, position_ids) - assert all_close(y_model, y_gm, atol=atol, rtol=rtol) - - # Set up cache configuration - cache_config = CacheConfig() - - # Get input node(s) - update_in_out_nodes(gm, cm) - - # Apply the transformation - insert_cached_attention(gm, cm, attn_descriptor=attn_descriptor, cache_config=cache_config) gm.to("cuda") cm.initialize_caches() From f7c597ec407cbf45a0f614ae0356541cf13d2be4 Mon Sep 17 00:00:00 2001 From: Daniel Stokes <40156487+djns99@users.noreply.github.com> Date: Fri, 22 Aug 2025 09:08:03 +1200 Subject: [PATCH 15/33] [None][perf] Make finalize fusion part of the tactic selection logic (#6915) Signed-off-by: djns99 <40156487+djns99@users.noreply.github.com> --- .../mixtureOfExpertsBackendBenchmarkFixture.h | 21 +- ...ixtureOfExpertsBackendBenchmarkLauncher.cu | 250 ++++-------------- .../include/cutlass_extensions/gemm_configs.h | 14 +- .../include/moe_gemm_kernels.h | 17 +- .../cutlass_kernels/include/moe_kernels.h | 36 ++- .../int8_gemm/int8_gemm_template.h | 1 - .../moe_gemm/moe_gemm_template_dispatch.h | 37 ++- .../cutlass_kernels/moe_gemm/moe_kernels.cu | 55 ++-- ...llm_internal_cutlass_kernels_static.tar.xz | 4 +- .../aarch64-linux-gnu/version.txt | 4 +- ...llm_internal_cutlass_kernels_static.tar.xz | 4 +- .../x86_64-linux-gnu/version.txt | 4 +- .../mixtureOfExpertsPlugin.cpp | 6 +- .../mixtureOfExperts/mixtureOfExpertsPlugin.h | 1 + cpp/tensorrt_llm/thop/moeOp.cpp | 29 +- .../kernels/mixtureOfExpertsTest.cu | 56 ++-- tensorrt_llm/_torch/autotuner.py | 3 +- .../_torch/custom_ops/torch_custom_ops.py | 44 +-- .../custom_ops/trtllm_gen_custom_ops.py | 35 +-- tests/unittest/_torch/misc/test_autotuner.py | 13 +- 20 files changed, 263 insertions(+), 371 deletions(-) diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h index 2559ae5484..36cbe76544 100644 --- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h +++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h @@ -833,7 +833,7 @@ public: // Runs for 3 iterations or 1 second and picks the best option int pickBestTactic(MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile) { - auto tactics = mMoERunner.getTactics(); + auto tactics = mMoERunner.getTactics(static_cast(gemm_to_profile)); ::nvtx3::scoped_range nvtx(tensorrt_llm::common::nvtx::nextColor(), "Tactic Profiling GEMM " + std::to_string(static_cast(gemm_to_profile))); // We save space by reusing the same workspace buffer for all tactics when doing full layer profiling. 
So we @@ -925,12 +925,14 @@ public: std::pair setTactic( int tactic_idx1, int tactic_idx2, MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile) { - auto tactics = mMoERunner.getTactics(); + auto tactics1 = mMoERunner.getTactics(MoeGemmId::GEMM_1); + auto tactics2 = mMoERunner.getTactics(MoeGemmId::GEMM_2); std::vector, GemmToProfile>> tactics_to_profile{ {tactic_idx1, GemmToProfile::GEMM_1}, {tactic_idx2, GemmToProfile::GEMM_2}}; for (auto& combo : tactics_to_profile) { auto& t = combo.first.get(); + auto& tactics = combo.second == GemmToProfile::GEMM_1 ? tactics1 : tactics2; if (combo.second != gemm_to_profile && gemm_to_profile != GemmToProfile::LAYER) { t = 0; // Unneeded tactic, set to 0 @@ -947,7 +949,7 @@ public: } } - mMoERunner.setTactic(tactics[tactic_idx1], tactics[tactic_idx2]); + mMoERunner.setTactic(tactics1[tactic_idx1], tactics2[tactic_idx2]); mBestTacticGemm1 = tactic_idx1; mBestTacticGemm2 = tactic_idx2; return {tactic_idx1, tactic_idx2}; @@ -965,7 +967,7 @@ public: auto expert_weights_size = gemm_to_profile == GemmToProfile::GEMM_1 ? mExpertWeight1Size : mExpertWeight2Size; - auto tactics = mMoERunner.getTactics()[tactic_idx]; + auto tactics = mMoERunner.getTactics(static_cast(gemm_to_profile))[tactic_idx]; if (static_cast(gemm_to_profile) != static_cast(mGemmProfilerBackend.mGemmToProfile)) { throw std::runtime_error("Configuration mismatch between mGemmProfilerBackend and runMoEPermute"); @@ -1074,11 +1076,12 @@ void MixtureOfExpertsBenchmark::runBenchmark(benchmark::State& state } if (LOG_LEVEL >= INFO) { - auto tactics = mMoERunner.getTactics(); - std::cout << "Selected tactic #1: " << tactic_idx1 << "/" << tactics.size() << "\n" - << tactics[tactic_idx1].toString() << std::endl; - std::cout << "Selected tactic #2: " << tactic_idx2 << "/" << tactics.size() << "\n" - << tactics[tactic_idx2].toString() << std::endl; + auto tactics1 = mMoERunner.getTactics(MoeGemmId::GEMM_1); + auto tactics2 = mMoERunner.getTactics(MoeGemmId::GEMM_2); + std::cout << "Selected tactic #1: " << tactic_idx1 << "/" << tactics1.size() << "\n" + << tactics1[tactic_idx1].toString() << std::endl; + std::cout << "Selected tactic #2: " << tactic_idx2 << "/" << tactics2.size() << "\n" + << tactics2[tactic_idx2].toString() << std::endl; } state.counters["tactic_idx1"] = tactic_idx1; state.counters["tactic_idx2"] = tactic_idx2; diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu index b784c6d0bc..8e18694ad7 100644 --- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu +++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu @@ -42,148 +42,15 @@ struct WeightParams ->Apply(argGen>>) template -auto listAllTactics() +auto listAllTactics(MoeGemmId gemm_id) { int const sm = getSMVersion(); using RunnerType = decltype(BenchClass::mMoERunner); - return RunnerType::getTactics(sm); + return RunnerType::getTactics(sm, gemm_id); } template -int parseTacticToId(nlohmann::json tactic_config) -{ - bool is_tma_warp_specialized = tactic_config.at("is_tma_warp_specialized").get(); - int tile_shape_id = -1; - std::array tile_shape; - if (tactic_config.at("tile_shape").is_array()) - tactic_config.at("tile_shape").get_to(tile_shape); - else - tile_shape_id = tactic_config.at("tile_shape").get(); - - std::vector confs = listAllTactics(); - - try - { - for (int i = 0; i < confs.size(); i++) - { - auto const& c = confs[i]; - if (c.is_tma_warp_specialized != is_tma_warp_specialized) - 
continue; - - if (!is_tma_warp_specialized) - { - int stages = tactic_config.at("stages").get(); - if (c.stages != stages) - continue; - } - - if (tile_shape_id != -1) - { - int comp = c.getTileConfigAsInt(); - if (tile_shape_id != comp) - continue; - if (is_tma_warp_specialized && (int) c.cluster_shape != tactic_config.at("cluster_shape").get()) - continue; - - // Found matching config - return i; - } - - // Handle if the user provided a shape instead of the enum value - if (is_tma_warp_specialized) - { - // TODO Add cases for blackwell shapes - using Kv = uint64_t; - constexpr static auto K = [](int m, int n) { return (uint64_t(m) << 32) | uint64_t(n); }; - static std::unordered_map const tile_map{ - {K(64, 16), CutlassTileConfigSM90::CtaShape64x16x128B}, - {K(64, 32), CutlassTileConfigSM90::CtaShape64x32x128B}, - {K(64, 64), CutlassTileConfigSM90::CtaShape64x64x128B}, - {K(64, 128), CutlassTileConfigSM90::CtaShape64x128x128B}, - {K(64, 256), CutlassTileConfigSM90::CtaShape64x256x128B}, - - {K(128, 16), CutlassTileConfigSM90::CtaShape128x16x128B}, - {K(128, 32), CutlassTileConfigSM90::CtaShape128x32x128B}, - {K(128, 64), CutlassTileConfigSM90::CtaShape128x64x128B}, - {K(128, 128), CutlassTileConfigSM90::CtaShape128x128x128B}, - {K(128, 256), CutlassTileConfigSM90::CtaShape128x256x128B}, - {K(256, 128), CutlassTileConfigSM90::CtaShape256x128x128B}, - }; - - if (c.getTileConfigAsInt() != (int) tile_map.at(K(tile_shape[0], tile_shape[1]))) - continue; - - static std::unordered_map const cluster_map{ - // CTA configs for M=64 - {K(1, 1), ClusterShape::ClusterShape_1x1x1}, - {K(2, 1), ClusterShape::ClusterShape_2x1x1}, - {K(1, 2), ClusterShape::ClusterShape_1x2x1}, - {K(2, 2), ClusterShape::ClusterShape_2x2x1}, - }; - - std::array cluster_shape; - tactic_config.at("cluster_shape").get_to(cluster_shape); - - if (c.cluster_shape != cluster_map.at(K(cluster_shape[0], cluster_shape[1]))) - continue; - - // Found matching config - return i; - } - else - { - std::array warp_shape; - tactic_config.at("warp_shape").get_to(warp_shape); - - using Kv = uint64_t; - constexpr static auto K = [](std::array a, std::array b) - { - uint64_t sum = 0; - for (auto v : a) - sum = sum * 512 + v; - for (auto v : b) - sum = sum * 256 + v; - return sum; - }; - static std::unordered_map tile_map{ - {K({128, 128, 8}, {64, 64, 8}), CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8}, - - {K({16, 128, 64}, {16, 32, 64}), CutlassTileConfig::CtaShape16x128x64_WarpShape16x32x64}, - {K({32, 128, 64}, {32, 32, 64}), CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64}, - - {K({64, 128, 64}, {32, 64, 64}), CutlassTileConfig::CtaShape64x128x64_WarpShape32x64x64}, - {K({64, 64, 128}, {32, 64, 64}), CutlassTileConfig::CtaShape64x64x128_WarpShape32x64x64}, - {K({64, 128, 64}, {64, 32, 64}), CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64}, - - {K({128, 64, 64}, {64, 32, 64}), CutlassTileConfig::CtaShape128x64x64_WarpShape64x32x64}, - {K({128, 128, 64}, {64, 32, 64}), CutlassTileConfig::CtaShape128x128x64_WarpShape64x32x64}, - {K({128, 128, 64}, {64, 64, 64}), CutlassTileConfig::CtaShape128x128x64_WarpShape64x64x64}, - {K({128, 128, 64}, {64, 32, 64}), CutlassTileConfig::CtaShape128x128x64_WarpShape128x32x64}, - {K({128, 256, 64}, {64, 64, 64}), CutlassTileConfig::CtaShape128x256x64_WarpShape64x64x64}, - - {K({256, 128, 64}, {64, 64, 64}), CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64}, - - {K({16, 256, 64}, {16, 64, 64}), CutlassTileConfig::CtaShape16x256x64_WarpShape16x64x64} - - }; - if (c.tile_config_sm80 
!= tile_map.at(K(tile_shape, warp_shape))) - continue; - - // Found matching config - return i; - } - } - } - catch (std::out_of_range const& e) - { - std::cerr << "Warning: error parsing tactic " << tactic_config.dump(2) << std::endl; - } - - return -1; -} - -template -void parseTacticToVectorID(nlohmann::json& tactic, std::vector& tactic_ids) +void parseTacticToVectorID(nlohmann::json& tactic, std::vector& tactic_ids, MoeGemmId gemm_id) { if (tactic.is_number_integer()) { @@ -193,20 +60,16 @@ void parseTacticToVectorID(nlohmann::json& tactic, std::vector& tactic_ids) { for (auto c : tactic) { - parseTacticToVectorID(c, tactic_ids); + parseTacticToVectorID(c, tactic_ids, gemm_id); } } - else if (tactic.is_object()) - { - tactic_ids.push_back(parseTacticToId(tactic)); - } else if (tactic.is_string()) { assert(tactic.is_string()); auto tactic_name = tactic.get(); if (tactic_name == "all") { - auto all_tactics = listAllTactics(); + auto all_tactics = listAllTactics(gemm_id); tactic_ids.resize(all_tactics.size()); std::iota(tactic_ids.begin(), tactic_ids.end(), 0); } @@ -410,39 +273,15 @@ void argGenLoadFile(benchmark::internal::Benchmark* benchmark) } // Do this after filtering datatypes as tactics only make sense if we know the data type - bool has_tactic_ids2 = false; std::vector tactic_ids1{}; std::vector tactic_ids2{}; - if (run_config.contains("tactic_id1") || run_config.contains("tactic_id2")) + if (run_config.contains("tactic_id1")) { - if (run_config.contains("tactic_id")) - { - throw std::invalid_argument("Cannot use tactic_id and tactic_idX"); - } - has_tactic_ids2 = true; - parseTacticToVectorID(run_config["tactic_id1"], tactic_ids1); - parseTacticToVectorID(run_config["tactic_id2"], tactic_ids2); + parseTacticToVectorID(run_config["tactic_id1"], tactic_ids1, MoeGemmId::GEMM_1); } - else + if (run_config.contains("tactic_id2")) { - parseTacticToVectorID(run_config["tactic_id"], tactic_ids1); - has_tactic_ids2 = false; - tactic_ids2.resize(1); // Dummy value so we loop exactly once below - } - if (tactic_ids1.empty() || tactic_ids2.empty()) - { - std::cerr << "Warning: Skipping benchmark, no valid tactic found" << std::endl; - static bool printed = false; - if (!printed) - { - printed = true; - std::cerr << __PRETTY_FUNCTION__ << ": Valid Tactics are:\n"; - auto confs = listAllTactics(); - for (auto c : confs) - std::cerr << c.toString(); - } - - continue; + parseTacticToVectorID(run_config["tactic_id2"], tactic_ids2, MoeGemmId::GEMM_2); } auto get_or = [&](auto name, auto def) @@ -478,8 +317,6 @@ void argGenLoadFile(benchmark::internal::Benchmark* benchmark) } else if (gemm_to_profile == (int) GemmToProfile::GEMM_2) { - if (!has_tactic_ids2) - tactic_ids2 = std::move(tactic_ids1); tactic_ids1 = {-1}; } } @@ -494,14 +331,31 @@ void argGenLoadFile(benchmark::internal::Benchmark* benchmark) return val; }; + if (tactic_ids1.empty() || tactic_ids2.empty()) + { + std::cerr << "Warning: Skipping benchmark, no valid tactic found" << std::endl; + static bool printed = false; + if (!printed) + { + printed = true; + std::cerr << __PRETTY_FUNCTION__ << ": Valid Tactics are:\n"; + for (auto gemm_id : {MoeGemmId::GEMM_1, MoeGemmId::GEMM_2}) + { + std::cerr << "GEMM " << (int) gemm_id << ":\n"; + auto confs = listAllTactics(gemm_id); + for (auto c : confs) + std::cerr << c.toString(); + std::cerr << std::endl; + } + } + + continue; + } + for (auto t1 : tactic_ids1) { - // tactic_ids2 will have one dummy value if has_tactic_ids2 = false for (auto t2 : tactic_ids2) { - if (!has_tactic_ids2) - t2 = 
t1; - benchmark->Args({num_experts, // get_range("k"), // get_range("hidden_size"), // @@ -531,7 +385,7 @@ void argGenHardcoded(benchmark::internal::Benchmark* benchmark) // {ActivationType::Relu, ActivationType::Gelu, // ActivationType::Silu, ActivationType::Geglu, // ActivationType::Swiglu}; - auto cutlass_tactic = {-1}; // {0,..., listAllTactics().size()}; + auto cutlass_tactic = {-1}; // {0,..., listAllTactics(MoeGemmId).size()}; auto routing_config = {LOAD_BALANCED_ROUTING_CONFIG}; // {0, 1, 2}; for (auto num_expert : num_experts) @@ -558,14 +412,18 @@ void argGen(benchmark::internal::Benchmark* benchmark) { if (LOG_LEVEL >= VERBOSE) { - std::cout << "List of all tactics for dtype " << (int) BenchClass::toDTypeID() << ":\n"; - int i = 0; - for (auto& t : listAllTactics()) + std::cout << "== List of all tactics for dtype " << (int) BenchClass::toDTypeID() << " ==\n"; + for (auto gemm_id : {MoeGemmId::GEMM_1, MoeGemmId::GEMM_2}) { - std::cout << "Tactic " << i << ":\n"; - std::cout << t.toString() << std::endl; + int i = 0; + std::cout << "=== GEMM " << (int) gemm_id << " ===\n"; + for (auto& t : listAllTactics(gemm_id)) + { + std::cout << "==== Tactic " << i << " ====\n"; + std::cout << t.toString() << std::endl; - i++; + i++; + } } } @@ -652,7 +510,6 @@ void help() " \"bias\": int, (optional)\n" " \"do_final_scale\": int, (optional)\n" " \"act_fn\": int,\n" - " \"tactic_id\": tactic, (see below)\n" " \"tactic_id1\": tactic, (see below)\n" " \"tactic_id2\": tactic, (see below)\n" " \"dtypes\": [string, ...], (optional)\n" @@ -676,27 +533,14 @@ void help() "- \"do_final_scale\" - If final scales should be applied, 0 = no scale, 1 = scale\n" "- \"act_fn\" - The activation function to use, 0 = identity, 1 = relu, 2 = gelu, 3 = silu, 4 = geglu, 5 = " "swiglu\n" - "- \"tactic_id, tactic_id1, tactic_id2\"\n" - "The config for the CUTLASS GEMM. tactic_id sets the same tactic for both to the same tactic (except in " - "auto mode)\n" - "Use tactic_idX to set the tactic for the corresponding GEMM" + "- \"tactic_id1, tactic_id2\"\n" + "The config for the CUTLASS GEMM. tactic_idX sets the tactic for the corresponding GEMM" "Valid tactics are:\n" - " - An object:\n" - " {\n" - " \"is_tma_warp_specialized\": bool,\n" - " \"tile_shape\": [int, int, int] or int,\n" - " \"cluster_shape\": [int, int, int] or int, (required for sm90, type must be an int if tile_shape " - "is " - "an int)\n" - " \"warp_shape\": [int, int, int], (required for non-sm90 if tile_shape is an array)\n" - " \"stages\": int, (required for non-sm90)\n" - " },\n" - " - An integer: corresponds to an index in the tactics array. WARNING this is not stable between test " - "configurations\n" - " - An array: of integers or objects, forms a list of tactics to sweep\n" + " - An integer: corresponds to an index in the tactics array. WARNING this is not stable between data types " + "or GPU architectures\n" + " - An array: of integers, forms a list of tactics to sweep\n" " - The string \"all\": This will sweep through all possible tactics\n" - " - The string \"auto\": This runs a short benchmark to pick the fastest tactic before each benchmark " - "case. " + " - The string \"auto\": This runs a short benchmark to pick the fastest tactic before each benchmark case. 
" "Useful for quick perf tests, prefer a full sweep and manually setting the tactic for more accurate " "results" "- dtypes - A list of dtypes to run this config through.\n" diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h index fe75687e36..a8b13f353a 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h @@ -433,6 +433,14 @@ struct CutlassGemmConfig int sm_version = 80; // Use 80 as a catch all for <90 bool is_tma_warp_specialized = false; + enum class EpilogueFusionType : int + { + NONE, + FINALIZE + }; + + EpilogueFusionType epilogue_fusion_type = EpilogueFusionType::NONE; + CutlassGemmConfig() = default; CutlassGemmConfig(CutlassTileConfig tile_config, SplitKStyle split_k_style, int split_k_factor, int stages) @@ -502,7 +510,8 @@ struct CutlassGemmConfig << "\n\tsm: " << sm_version << "\n\ttile shape ID: " << getTileConfigAsInt() << "\n\tcluster shape ID: " << (int) cluster_shape << "\n\tmainloop sched: " << (int) mainloop_schedule << "\n\tepi sched: " << (int) epilogue_schedule - << "\n\tenable cuda kernel: " << (enableCudaKernel ? "true" : "false"); + << "\n\tenable cuda kernel: " << (enableCudaKernel ? "true" : "false") + << "\n\tepilogue fusion type: " << (int) epilogue_fusion_type; } else if (tile_config_sm80 != tensorrt_llm::cutlass_extensions::CutlassTileConfig::ChooseWithHeuristic) { @@ -534,7 +543,8 @@ inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& conf << ", mainloop_schedule_enum: " << int(config.mainloop_schedule) << ", epilogue_schedule_enum: " << int(config.epilogue_schedule) << ", cluster_shape_enum: " << int(config.cluster_shape) - << ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false"); + << ", enable_cuda_kernel: " << (config.enableCudaKernel ? 
"true" : "false") + << ", epilogue_fusion_type: " << int(config.epilogue_fusion_type); } else { diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h index 3c814851c9..16b7838ed6 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h @@ -289,15 +289,20 @@ public: void moeGemm(GroupedGemmInput inputs, TmaWarpSpecializedGroupedGemmInput hopper_inputs); - std::vector getConfigs() const; - static std::vector getConfigs(int sm); - static std::vector getTmaWarpSpecializedConfigs(int sm); - static std::vector getBlackwellConfigs(int sm); - static std::vector getHopperConfigs(int sm); + std::vector getConfigs(bool supports_finalize_fusion) const; + static std::vector getConfigs(int sm, bool supports_finalize_fusion); + static std::vector getTmaWarpSpecializedConfigs( + int sm, bool supports_finalize_fusion); static std::vector getAmpereConfigs(int sm); [[nodiscard]] bool isTmaWarpSpecialized(cutlass_extensions::CutlassGemmConfig gemm_config) const; - [[nodiscard]] bool supportsTmaWarpSpecialized() const; + + [[nodiscard]] bool supportsTmaWarpSpecialized() const + { + return supportsTmaWarpSpecialized(sm_); + } + + [[nodiscard]] static bool supportsTmaWarpSpecialized(int sm); [[nodiscard]] bool isFusedGatedActivation(cutlass_extensions::CutlassGemmConfig gemm_config, ActivationType activation_type, int gemm_n, int gemm_k) const; [[nodiscard]] bool supportsFusedGatedActivation(ActivationType activation_type, int gemm_n, int gemm_k) const; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h index 7d592bed0e..389591e7fe 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h @@ -228,6 +228,13 @@ struct MOEParallelismConfig } }; +enum class MoeGemmId : int +{ + Undefined = 0, + GEMM_1, + GEMM_2 +}; + struct QuantParams { // Int weight only quantization params @@ -446,7 +453,7 @@ public: virtual void setTactic(std::optional gemm1_config, std::optional gemm2_config) = 0; - virtual std::vector getTactics() = 0; + virtual std::vector getTactics(MoeGemmId gemm_id) = 0; virtual void runMoe(void const* input_activations, void const* input_sf, bool const swizzled_input_sf, int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights, @@ -593,15 +600,15 @@ public: gemm2_config_ = std::move(gemm2_config); } - std::vector getTactics() override + std::vector getTactics(MoeGemmId gemm_id) override { - return moe_gemm_runner_.getConfigs(); + return moe_gemm_runner_.getConfigs(gemm_id == MoeGemmId::GEMM_2 && mayHaveFinalizeFused()); } - static std::vector getTactics(int sm) + static std::vector getTactics(int sm, MoeGemmId gemm_id) { using RunnerType = decltype(moe_gemm_runner_); - return RunnerType::getConfigs(sm); + return RunnerType::getConfigs(sm, gemm_id == MoeGemmId::GEMM_2 && Self::mayHaveFinalizeFused(sm)); } void runMoe(void const* input_activations, void const* input_sf, bool const swizzled_input_sf, @@ -798,6 +805,12 @@ private: && !use_w4_groupwise; } + static bool mayHaveFinalizeFused(int sm) + { + using RunnerType = decltype(moe_gemm_runner_); + return RunnerType::supportsTmaWarpSpecialized(sm) && sm >= 90 && !use_w4_groupwise; + } + // TODO: This should eventually take the quant params to give 
more flexibility static auto getScalingType() { @@ -895,12 +908,7 @@ struct GemmProfilerBackend { public: using Config = cutlass_extensions::CutlassGemmConfig; - enum class GemmToProfile - { - Undefined = 0, - GEMM_1, - GEMM_2 - }; + using GemmToProfile = MoeGemmId; void init(CutlassMoeFCRunnerInterface& runner, GemmToProfile gemm_to_profile, nvinfer1::DataType dtype, nvinfer1::DataType wtype, nvinfer1::DataType otype, int num_experts, int k, int64_t hidden_size, @@ -951,7 +959,6 @@ public: CutlassMoeFCRunnerInterface* mInterface; GemmToProfile mGemmToProfile = GemmToProfile::Undefined; - std::vector mAllTacticsSaved; int mSM{}; int64_t mNumExperts{}; int64_t mNumExpertsPerNode{}; @@ -972,7 +979,7 @@ public: // This will be a unique value for every iteration of warmup and actual bench constexpr static int64_t NUM_ROUTING_SAMPLES = 16; - std::array mTmaInputCache; + std::array, NUM_ROUTING_SAMPLES> mTmaInputCache; QuantParams mQuantParams; bool mBias{}; @@ -985,7 +992,8 @@ public: private: void prepareRouting(int num_tokens, char* workspace, cudaStream_t stream); void prepareQuantParams(int num_tokens, char* workspace, cudaStream_t stream); - void prepareTmaWsInputs(int num_tokens, char* workspace, void const* expert_weights, cudaStream_t stream); + void prepareTmaWsInputs(int num_tokens, char* workspace, void const* expert_weights, + TmaWarpSpecializedGroupedGemmInput::EpilogueFusion fusion, cudaStream_t stream); }; // Populates a buffer with random values for use with MOE benchmarking diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h index c44caae0fa..ef06abceee 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h @@ -57,7 +57,6 @@ namespace kernels { namespace cutlass_kernels { - template void genericInt8GemmKernelLauncher(int8_t const* A, int8_t const* B, tk::QuantMode quantOption, float const* alphaCol, float const* alphaRow, T* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspace, diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h index 56a8299f18..0b009f6099 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h @@ -471,17 +471,18 @@ void dispatchMoeGemmToCutlass(GroupedGemmInput -std::vector -MoeGemmRunner::getConfigs() const +std::vector MoeGemmRunner::getConfigs( + bool supports_finalize_fusion) const { - return getConfigs(sm_); + return getConfigs(sm_, supports_finalize_fusion); } template std::vector MoeGemmRunner::getConfigs( - int sm) + int sm, bool supports_finalize_fusion) { - std::vector candidate_configs = getTmaWarpSpecializedConfigs(sm); + std::vector candidate_configs + = getTmaWarpSpecializedConfigs(sm, supports_finalize_fusion); std::vector ampere_configs = getAmpereConfigs(sm); std::copy(ampere_configs.begin(), ampere_configs.end(), std::back_inserter(candidate_configs)); return candidate_configs; @@ -517,7 +518,8 @@ MoeGemmRunner::getAmpereConfigs(int sm template std::vector -MoeGemmRunner::getTmaWarpSpecializedConfigs(int sm) +MoeGemmRunner::getTmaWarpSpecializedConfigs( + int sm, bool supports_finalize_fusion) { using tensorrt_llm::cutlass_extensions::CutlassGemmConfig; static constexpr 
auto weight_only_flag @@ -554,6 +556,17 @@ MoeGemmRunner::getTmaWarpSpecializedCo std::vector tma_ws_configs = kernels::cutlass_kernels::get_candidate_configs(sm, max_split_k, config_type_param); + if (supports_finalize_fusion) + { + // Duplicate the configs and set the epilogue fusion type to FINALIZE + auto finalize_configs = tma_ws_configs; + std::transform(finalize_configs.begin(), finalize_configs.end(), std::back_inserter(tma_ws_configs), + [](auto& config) + { + config.epilogue_fusion_type = cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE; + return config; + }); + } return tma_ws_configs; } @@ -566,11 +579,11 @@ bool MoeGemmRunner::isTmaWarpSpecializ } template -bool MoeGemmRunner::supportsTmaWarpSpecialized() const +bool MoeGemmRunner::supportsTmaWarpSpecialized(int sm) { - return (sm_ == 90 && kernels::cutlass_kernels::isValidHopperMOESpecialisation()) - || (sm_ >= 100 && sm_ < 120 && kernels::cutlass_kernels::isValidBlackwellMOESpecialisation()) - || ((sm_ == 120 || sm_ == 121) && kernels::cutlass_kernels::isValidSM120MOESpecialisation()); + return (sm == 90 && kernels::cutlass_kernels::isValidHopperMOESpecialisation()) + || (sm >= 100 && sm < 120 && kernels::cutlass_kernels::isValidBlackwellMOESpecialisation()) + || ((sm == 120 || sm == 121) && kernels::cutlass_kernels::isValidSM120MOESpecialisation()); } template @@ -815,7 +828,9 @@ size_t MoeGemmRunner::calcMaxWorkspace if constexpr (kernels::cutlass_kernels::isValidTmaWarpSpecializedMOESpecialisation() && !use_w4afp8 && !use_wfp4a16) { - auto configs = getTmaWarpSpecializedConfigs(sm_); + // Finalize fusion may not actually be supported by the kernel, + // if they are not we will catch the error and skip them + auto configs = getTmaWarpSpecializedConfigs(sm_, true); auto fpX_block_scaling_type = TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::NONE; if constexpr (use_wfp4afp4) { diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu index 730840717c..ef70b9d45e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu @@ -2847,9 +2847,10 @@ void CutlassMoeFCRunnerepilogue_fusion_type == cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE; permuted_token_final_scales_ - = (gemm2_using_tma_ws && mayHaveFinalizeFused()) ? getWsPtr(float{}, "permuted_token_final_scales") : nullptr; + = gemm2_using_finalize_fusion ? getWsPtr(float{}, "permuted_token_final_scales") : nullptr; bool const is_gated_activation = isGatedActivation(activation_type); bool const gemm1_using_fused_moe @@ -4006,8 +4007,12 @@ CutlassMoeFCRunner:: bool apply_bias = parallelism_config.tp_rank == 0; auto* fc2_bias = apply_bias ? 
fc2_expert_biases : nullptr; + bool gemm2_using_finalize_fusion = gemm2_config_->epilogue_fusion_type + == cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE; bool using_fused_finalize - = use_fused_finalize_ && gemm2_config_->sm_version >= 90 && !use_w4_groupwise && !use_lora; + = use_fused_finalize_ && gemm2_using_finalize_fusion && !use_w4_groupwise && !use_lora; + TLLM_CHECK_WITH_INFO(using_fused_finalize == gemm2_using_finalize_fusion, + "GEMM2 tactic requests finalize fusion, but the runner is not configured to use it"); if (using_fused_finalize) { assert(min_latency_mode == false); @@ -4550,14 +4555,26 @@ void GemmProfilerBackend::prepareQuantParams(int num_tokens, char* workspace_ptr } } -void GemmProfilerBackend::prepareTmaWsInputs( - int num_tokens, char* workspace_ptr_char, void const* expert_weights, cudaStream_t stream) +void GemmProfilerBackend::prepareTmaWsInputs(int num_tokens, char* workspace_ptr_char, void const* expert_weights, + TmaWarpSpecializedGroupedGemmInput::EpilogueFusion fusion, cudaStream_t stream) { if (mSM < 90) { return; } + bool use_w4afp8 = (mDType == nvinfer1::DataType::kFP8 && mWType == nvinfer1::DataType::kINT4); + bool use_wfp4a16 = ((mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16) + && mWType == nvinfer1::DataType::kUINT8); + bool use_w4_groupwise = use_w4afp8 || use_wfp4a16; + bool const use_finalize_fusion = fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE; + bool const finalize_fusion_not_supported = !mInterface->use_fused_finalize_ || mMinLatencyMode || use_w4_groupwise + || mGemmToProfile != GemmToProfile::GEMM_2; + if (use_finalize_fusion && finalize_fusion_not_supported) + { + return; + } + auto workspaces = getProfilerWorkspaces(num_tokens, mSM >= 90); #define GET_WS_PTR(type, name) \ @@ -4596,28 +4613,24 @@ void GemmProfilerBackend::prepareTmaWsInputs( size_t num_expanded_tokens = num_tokens * mK; for (int64_t i = 0; i < NUM_ROUTING_SAMPLES; i++) { - mTmaInputCache[i].configureWorkspace(tma_ws_input_workspace, mNumExpertsPerNode, gemm_workspace, + // Note: Even though we have separate TMA WS inputs for finalize fusion on/off we reuse the same pointers to + // save space. + auto& cache_element = mTmaInputCache[i][use_finalize_fusion]; + cache_element.configureWorkspace(tma_ws_input_workspace, mNumExpertsPerNode, gemm_workspace, workspaces.at("gemm_workspace").first, mScalingType); tma_ws_input_workspace += tma_ws_size; int64_t* expert_first_token_offset = expert_first_token_offset_base + i * (mNumExpertsPerNode + 1); int* permuted_row_to_unpermuted_row = permuted_row_to_unpermuted_row_base + i * num_expanded_tokens; - auto& gemm1_tma_ws_input = mGemmToProfile == GemmToProfile::GEMM_1 ? mTmaInputCache[i] : dummy_tma_ws_input; - auto& gemm2_tma_ws_input = mGemmToProfile == GemmToProfile::GEMM_2 ? mTmaInputCache[i] : dummy_tma_ws_input; + auto& gemm1_tma_ws_input = mGemmToProfile == GemmToProfile::GEMM_1 ? cache_element : dummy_tma_ws_input; + auto& gemm2_tma_ws_input = mGemmToProfile == GemmToProfile::GEMM_2 ? 
cache_element : dummy_tma_ws_input; if (mSM >= 90) { /* GEMM1 */ gemm1_tma_ws_input.fusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE; gemm2_tma_ws_input.fusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE; - - bool use_w4afp8 = (mDType == nvinfer1::DataType::kFP8 && mWType == nvinfer1::DataType::kINT4); - bool use_wfp4a16 = ((mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16) - && mWType == nvinfer1::DataType::kUINT8); - bool use_w4_groupwise = use_w4afp8 || use_wfp4a16; - bool using_fused_finalize - = mInterface->use_fused_finalize_ && mSM >= 90 && !mMinLatencyMode && !use_w4_groupwise; - if (using_fused_finalize) + if (use_finalize_fusion) { assert(!mMinLatencyMode); gemm2_tma_ws_input.fusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE; @@ -4652,7 +4665,6 @@ void GemmProfilerBackend::prepareTmaWsInputs( void GemmProfilerBackend::prepare( int num_tokens, char* workspace_ptr_char, void const* expert_weights, cudaStream_t stream) { - mAllTacticsSaved = mInterface->getTactics(); mSampleIndex = 0; auto workspace_size = getWorkspaceSize(num_tokens); @@ -4660,7 +4672,10 @@ void GemmProfilerBackend::prepare( prepareRouting(num_tokens, workspace_ptr_char, stream); prepareQuantParams(num_tokens, workspace_ptr_char, stream); - prepareTmaWsInputs(num_tokens, workspace_ptr_char, expert_weights, stream); + prepareTmaWsInputs(num_tokens, workspace_ptr_char, expert_weights, + TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE, stream); + prepareTmaWsInputs(num_tokens, workspace_ptr_char, expert_weights, + TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE, stream); } size_t GemmProfilerBackend::getWorkspaceSize(int maxM) @@ -4724,7 +4739,9 @@ void GemmProfilerBackend::runProfiler(int original_num_tokens, Config const& tac TmaWarpSpecializedGroupedGemmInput tma_ws_input_template; if (tactic.is_tma_warp_specialized) { - tma_ws_input_template = mTmaInputCache[mSampleIndex]; + tma_ws_input_template = mTmaInputCache[mSampleIndex][tactic.epilogue_fusion_type + == cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE]; + TLLM_CHECK_WITH_INFO(tma_ws_input_template.isValid(), "TMA WS input template is not initialized"); } mInterface->is_profiler = true; diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz index 08cd9b6f66..5ebd5f7ebe 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86586b9f6845e91e8ba0accad53a5a3418c50d8fd30ad49fa8837470c72b5dcf -size 67051604 +oid sha256:d6a3f6adef11003f794a6cec1235d0c622ead71b4e801a89866e91dfd91bb30c +size 67053244 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt index 8b500f5c97..b93f46ea6d 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -568cb6ca2413c93b0f5839dd05577c0c57bc4b5f2359366c79d0ace665de4bd6 
libtensorrt_llm_internal_cutlass_kernels_static.a -commit 9c0a42825905952beaf9b35d5a35d58de1a123fa +317a25037093a6f3d156ffa58a68bce53071ef68dacdcb04cc0aaeea80b64e76 libtensorrt_llm_internal_cutlass_kernels_static.a +commit 444ef1b3b06cdc7ee66b4e612ce26ad25967440b diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz index f1a6b9dc88..bd07528460 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6489751f16a4dadf42664738ded03fbbd60195619f2d5f80af8190554318257d -size 66872936 +oid sha256:489fb557b78062efedd1514f2995fafb9216bb0e0068a550e86763efb9d5eee9 +size 66874608 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt index 4af58b0800..3c053c1a91 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -813c237a565664b2acf2313f0e436f66f24deeb16a84d273dc007af55795e55f libtensorrt_llm_internal_cutlass_kernels_static.a -commit 9c0a42825905952beaf9b35d5a35d58de1a123fa +5a31acd0fb1415196bff71fa4a8d1dded147e15ea10821cc46c85684c66986ee libtensorrt_llm_internal_cutlass_kernels_static.a +commit 444ef1b3b06cdc7ee66b4e612ce26ad25967440b diff --git a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp index 189e23b8ac..59d92e6429 100644 --- a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp +++ b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp @@ -946,8 +946,8 @@ int MixtureOfExpertsPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, std::optional gemm2; if (common::getEnvForceDeterministicMOE()) { - gemm1 = mMOERunner->getTactics()[0]; - gemm2 = mMOERunner->getTactics()[0]; + gemm1 = mMOERunner->getTactics(MoeGemmId::GEMM_1)[0]; + gemm2 = mMOERunner->getTactics(MoeGemmId::GEMM_2)[0]; } else { @@ -1278,7 +1278,7 @@ void MixtureOfExpertsGemmProfiler::runTactic(int m, int n, int k, MixtureOfExper auto MixtureOfExpertsGemmProfiler::getTactics(int m, int n, int k) const -> std::vector { assert(mRunner); - return mRunner->mMOERunner->getTactics(); + return mRunner->mMOERunner->getTactics(backend.mGemmToProfile); } void MixtureOfExpertsGemmProfiler::initTmpData( diff --git a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h index cd3aaf52c2..feb1f10cdc 100644 --- a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h +++ b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.h @@ -43,6 +43,7 @@ namespace kernels = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE; using MoeMinLatencyParams = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::MoeMinLatencyParams; using MOEParallelismConfig = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::MOEParallelismConfig; using QuantParams = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::QuantParams; +using MoeGemmId = CUTLASS_MOE_GEMM_NAMESPACE::MoeGemmId; 
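The plugin, torch-op, and autotuner hunks that follow all thread a per-GEMM identifier through tactic enumeration: getTactics(MoeGemmId) and get_tactic_num(gemm_idx) on the C++ side, and a get_valid_tactics hook that now accepts **kwargs on the Python side. The sketch below shows a runner written against that updated hook; it is illustrative only, is not part of the patch, and uses plain typing instead of the real TunableRunner/OptimizationProfile base types, which are assumed to live in tensorrt_llm._torch.autotuner.

    # Illustrative sketch only -- not part of this patch.
    # In the real code the class derives from TunableRunner and `profile` is an
    # OptimizationProfile (assumed import path: tensorrt_llm._torch.autotuner).
    from typing import Any, List
    import torch

    class PerGemmMoERunner:
        def __init__(self, num_tactics_gemm1: int, num_tactics_gemm2: int):
            # Mirrors FusedMoeRunner keeping separate GEMM_1 / GEMM_2 profile lists.
            self._counts = {1: num_tactics_gemm1, 2: num_tactics_gemm2}

        def get_valid_tactics(self, inputs: List[torch.Tensor],
                              profile: Any, **kwargs) -> List[int]:
            # The autotuner now forwards extra keyword args such as gemm_idx, so
            # each GEMM can expose its own tactic set (for example GEMM_2 gaining
            # finalize-fusion variants while GEMM_1 does not).
            return list(range(self._counts[kwargs["gemm_idx"]]))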
using ActivationType = CUTLASS_MOE_GEMM_NAMESPACE::ActivationType; using ActivationParams = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::ActivationParams; using TmaWarpSpecializedGroupedGemmInput = CUTLASS_MOE_GEMM_NAMESPACE::TmaWarpSpecializedGroupedGemmInput; diff --git a/cpp/tensorrt_llm/thop/moeOp.cpp b/cpp/tensorrt_llm/thop/moeOp.cpp index 328cce3d01..abeba273a8 100644 --- a/cpp/tensorrt_llm/thop/moeOp.cpp +++ b/cpp/tensorrt_llm/thop/moeOp.cpp @@ -48,6 +48,7 @@ namespace common = tensorrt_llm::common; namespace kernels = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE; using ActivationParams = CUTLASS_MOE_GEMM_NAMESPACE::ActivationParams; using ActivationType = CUTLASS_MOE_GEMM_NAMESPACE::ActivationType; +using MoeGemmId = CUTLASS_MOE_GEMM_NAMESPACE::MoeGemmId; // Always use public header as it is just utility functions and types using TmaWarpSpecializedGroupedGemmInput = tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput; using profiler_backend = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::GemmProfilerBackend; @@ -215,7 +216,8 @@ public: mKernelRunner->use_fused_finalize_ = mUseFusedFinalize; mProfiler = std::make_shared(); - mAllProfiles = mKernelRunner->getTactics(); + mGemm1Profiles = mKernelRunner->getTactics(MoeGemmId::GEMM_1); + mGemm2Profiles = mKernelRunner->getTactics(MoeGemmId::GEMM_2); } ~FusedMoeRunner() @@ -585,10 +587,11 @@ public: return std::make_tuple(output, num_active_experts_per_node, experts_to_token_score, active_expert_global_ids); } - int64_t getTacticNum() + int64_t getTacticNum(int64_t const gemm_idx) { std::lock_guard lock(mMutex); - return mAllProfiles.size(); + TORCH_CHECK(gemm_idx == 1 || gemm_idx == 2, "gemm_idx must be 1 or 2"); + return (gemm_idx == 1) ? mGemm1Profiles.size() : mGemm2Profiles.size(); } // TODO Update this to be able to tell if we are profiling swiglu bias @@ -624,10 +627,14 @@ public: : group_size_; int const num_experts = static_cast(fc2_expert_weights.sizes()[0] * ep_size); + auto const gemm_to_profile + = (gemm_idx == 1) ? profiler_backend::GemmToProfile::GEMM_1 : profiler_backend::GemmToProfile::GEMM_2; + auto const& profiles = (gemm_idx == 1) ? mGemm1Profiles : mGemm2Profiles; + // Get specific profile configs according to the profile_id. // Fallback tactic is set to be 0 // TODO: use the best tactic id found offline for a better default inference perf - auto const& profile = profile_id == -1 ? mAllProfiles.front() : mAllProfiles[profile_id]; + auto const& profile = profile_id == -1 ? profiles.front() : profiles[profile_id]; auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); @@ -638,8 +645,7 @@ public: if (do_preparation) { // Set profiled gemm idx - mProfiler->mGemmToProfile - = (gemm_idx == 1) ? 
profiler_backend::GemmToProfile::GEMM_1 : profiler_backend::GemmToProfile::GEMM_2; + mProfiler->mGemmToProfile = gemm_to_profile; // mProfiler init auto parallelism_config = kernels::MOEParallelismConfig(static_cast(tp_size), @@ -704,7 +710,8 @@ private: bool mUseFusedFinalize = true; using Profile = tensorrt_llm::cutlass_extensions::CutlassGemmConfig; - std::vector mAllProfiles; + std::vector mGemm1Profiles; + std::vector mGemm2Profiles; void freeProfileWorkspace() { @@ -730,15 +737,15 @@ private: return; } - auto best_gemm1_profile = mAllProfiles.front(); - auto best_gemm2_profile = mAllProfiles.front(); + auto best_gemm1_profile = mGemm1Profiles.front(); + auto best_gemm2_profile = mGemm2Profiles.front(); if (profile_ids.has_value()) { TORCH_CHECK(profile_ids.value().size() == 2, "Expecting 2 profile ids"); best_gemm1_profile - = profile_ids.value()[0] == -1 ? best_gemm1_profile : mAllProfiles.at(profile_ids.value()[0]); + = profile_ids.value()[0] == -1 ? best_gemm1_profile : mGemm1Profiles.at(profile_ids.value()[0]); best_gemm2_profile - = profile_ids.value()[1] == -1 ? best_gemm2_profile : mAllProfiles.at(profile_ids.value()[1]); + = profile_ids.value()[1] == -1 ? best_gemm2_profile : mGemm2Profiles.at(profile_ids.value()[1]); } mKernelRunner->setTactic(best_gemm1_profile, best_gemm2_profile); } diff --git a/cpp/tests/unit_tests/kernels/mixtureOfExpertsTest.cu b/cpp/tests/unit_tests/kernels/mixtureOfExpertsTest.cu index 6f2ce0f93e..11ae4273dc 100644 --- a/cpp/tests/unit_tests/kernels/mixtureOfExpertsTest.cu +++ b/cpp/tests/unit_tests/kernels/mixtureOfExpertsTest.cu @@ -370,8 +370,8 @@ protected: float mSparseMixerEpsilon = 0.2f; - // Default this to true. This only matters for K>2, and so by doing this we will test the fused and unfused paths - bool mUseDeterministicHopperReduce = true; + // Default this to false. This only matters for K>2, and so by doing this we will test the fused and unfused paths + bool mUseFusedFinalize = false; // Disable this for long running tests to speed up runtime bool mIsLongTest = false; @@ -456,7 +456,7 @@ protected: { managed_buffers.clear(); - mMoERunner.use_fused_finalize_ = k < 3 || !mUseDeterministicHopperReduce; + mMoERunner.use_fused_finalize_ = k < 3 || mUseFusedFinalize; mHiddenSize = hidden_size; mInterSize = hidden_size * mInterSizeFraction; @@ -1087,9 +1087,9 @@ protected: return std::tuple{(void*) weight_1, (void*) weight_2, bias_1, bias2_ptr, scale_1, scale_2, scale_3}; } - auto getFilteredConfigs(int sm) + auto getFilteredConfigs(int sm, MoeGemmId gemm_id) { - auto tactics = mMoERunner.getTactics(); + auto tactics = mMoERunner.getTactics(gemm_id); if (sm == 89 || sm >= 120) { // Filter some unsupported configs for L40S @@ -1120,17 +1120,27 @@ protected: auto selectTacticsForArch(int sm) { bool is_tma_warp_specialized = sm >= 90 && !INT_QUANT; - auto tactics = getFilteredConfigs(sm); - auto it = std::find_if(tactics.begin(), tactics.end(), + auto epilogue_fusion_type = (is_tma_warp_specialized && mUseFusedFinalize) + ? 
tensorrt_llm::cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE + : tensorrt_llm::cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::NONE; + auto tactics1 = getFilteredConfigs(sm, MoeGemmId::GEMM_1); + auto tactics2 = getFilteredConfigs(sm, MoeGemmId::GEMM_2); + auto it1 = std::find_if(tactics1.begin(), tactics1.end(), [is_tma_warp_specialized](auto& c) { return c.is_tma_warp_specialized == is_tma_warp_specialized; }); - if (it == tactics.end()) + auto it2 = std::find_if(tactics2.begin(), tactics2.end(), + [is_tma_warp_specialized, epilogue_fusion_type](auto& c) { + return c.is_tma_warp_specialized == is_tma_warp_specialized + && c.epilogue_fusion_type == epilogue_fusion_type; + }); + if (it1 == tactics1.end() || it2 == tactics2.end()) { // Fall back to any tactic std::cout << "WARNING: Could not find config for sm version " << sm << std::endl; - return std::pair{tactics[0], tactics[0]}; + it1 = (it1 == tactics1.end()) ? tactics1.begin() : it1; + it2 = (it2 == tactics2.end()) ? tactics2.begin() : it2; } - return std::pair(*it, *it); + return std::pair(*it1, *it2); } using ConfigsToTestVec = std::vectorget(); auto tactic1 = mInternalSelectedConfig1; auto tactic2 = mInternalSelectedConfig2; - if (!tactic1) + if (!tactic1 || !tactic2) { int sm = getSMVersion(); std::tie(tactic1, tactic2) = selectTacticsForArch(sm); @@ -1629,8 +1639,9 @@ void MixtureOfExpertsTest::BasicPermuteTest( auto [expected_experts, token_final_scales] = populateRouting(num_experts, num_tokens, k); runMoEPermute(hidden_input, expected_experts, token_final_scales, hidden_size, num_experts, k); - bool should_be_deterministic - = mUseDeterministicHopperReduce || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120; + bool is_finalize_fusion = gemm2.epilogue_fusion_type + == tensorrt_llm::cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE; + bool should_be_deterministic = !is_finalize_fusion || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120; if (should_be_deterministic && !mIsLongTest) { auto first_iter = getDataFromDevice(mFinalOutput, mTotalTokens * mHiddenSize); @@ -1749,7 +1760,7 @@ TYPED_TEST(MixtureOfExpertsTest, PermuteSwigluBias) TYPED_TEST(MixtureOfExpertsTest, PermuteNonDeterministic) { - this->mUseDeterministicHopperReduce = false; + this->mUseFusedFinalize = true; // Just test case 3, cases 1&2 always use the fused paths this->BasicPermuteTest(3); } @@ -1896,8 +1907,10 @@ void MixtureOfExpertsTest::ParallelismTest( // Only need to init the inputs on the first iteration runMoEPermute(hidden_input, expected_experts, token_final_scales, hidden_size, num_experts, k, MOEParallelismConfig{tp_size, i, ep_size, j}, enable_alltoall); + bool is_finalize_fusion = gemm2.epilogue_fusion_type + == tensorrt_llm::cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE; bool should_be_deterministic - = mUseDeterministicHopperReduce || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120; + = !is_finalize_fusion || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120; if (should_be_deterministic && !mIsLongTest) { auto first_iter = getDataFromDevice(mFinalOutput, mTotalTokens * mHiddenSize); @@ -1912,8 +1925,10 @@ void MixtureOfExpertsTest::ParallelismTest( else { runMoEPermute(MOEParallelismConfig{tp_size, i, ep_size, j}, enable_alltoall); + bool is_finalize_fusion = gemm2.epilogue_fusion_type + == tensorrt_llm::cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE; bool should_be_deterministic - = mUseDeterministicHopperReduce || mK < 3 || 
getSMVersion() < 90 || getSMVersion() >= 120; + = !is_finalize_fusion || mK < 3 || getSMVersion() < 90 || getSMVersion() >= 120; if (should_be_deterministic && !mIsLongTest) { auto first_iter = getDataFromDevice(mFinalOutput, mTotalTokens * mHiddenSize); @@ -2077,6 +2092,7 @@ PARALLEL_TEST_SUITE(MixedParallel) TYPED_TEST(MixtureOfExpertsTest, ConfigSweep) { this->mIsLongTest = true; + this->mUseFusedFinalize = true; // True for all cases because we sweep both auto genConfigName = [](auto conf) -> std::string { using namespace tensorrt_llm::cutlass_extensions; @@ -2103,12 +2119,13 @@ TYPED_TEST(MixtureOfExpertsTest, ConfigSweep) auto activation_pool = std::vector{ActivationType::Relu, ActivationType::Swiglu, ActivationType::SwigluBias}; if (this->NVFP4) activation_pool = {ActivationType::Relu}; - auto configs = this->getFilteredConfigs(getSMVersion()); + auto configs1 = this->getFilteredConfigs(getSMVersion(), MoeGemmId::GEMM_1); + auto configs2 = this->getFilteredConfigs(getSMVersion(), MoeGemmId::GEMM_2); for (auto const activation_type : activation_pool) { - for (auto conf1 : configs) + for (auto conf1 : configs1) { - for (auto conf2 : configs) + for (auto conf2 : configs2) { auto name1 = genConfigName(conf1); auto name2 = genConfigName(conf2); @@ -2120,7 +2137,6 @@ TYPED_TEST(MixtureOfExpertsTest, ConfigSweep) this->mActType = activation_type; for (auto k : {2, 3}) { - this->mOverrideSelectedConfig1 = conf1; this->mOverrideSelectedConfig2 = conf2; this->BasicPermuteTest(k, this->MINIMUM_ALIGNMENT); diff --git a/tensorrt_llm/_torch/autotuner.py b/tensorrt_llm/_torch/autotuner.py index da4df91f69..aa1b250b3a 100644 --- a/tensorrt_llm/_torch/autotuner.py +++ b/tensorrt_llm/_torch/autotuner.py @@ -453,7 +453,8 @@ class AutoTuner: p.name for p in inspect.signature(runner.forward).parameters.values() } - valid_tactics = runner.get_valid_tactics(input_tensors, profile) + valid_tactics = runner.get_valid_tactics(input_tensors, profile, + **kwargs) if "do_preparation" in runner_arg_names and len(valid_tactics) > 0: runner( input_tensors, diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py index bd946343b0..7d0c73364d 100644 --- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py @@ -81,12 +81,9 @@ class MoERunner(TunableRunner): use_fused_finalize) self.fused_moe_runner = MoERunner.runner_dict[instance_key] - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> List[int]: - return range(self.fused_moe_runner.get_tactic_num()) + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: + return range(self.fused_moe_runner.get_tactic_num(kwargs["gemm_idx"])) def forward( self, @@ -318,11 +315,8 @@ class FP8RowwiseGemmRunner(TunableRunner): self.fp8_rowwise_gemm_runner = FP8RowwiseGemmRunner.runner_dict[ instance_key] - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> List[int]: + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: return list(range(self.fp8_rowwise_gemm_runner.get_num_configs())) def forward( @@ -403,11 +397,8 @@ class FP4GemmRunner(TunableRunner): output_dtype, int(fp4_gemm_type)) self.fp4_gemm_runner = FP4GemmRunner.runner_dict[instance_key] - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> 
List[int]: + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: return list(range(self.fp4_gemm_runner.get_num_configs())) def forward( @@ -518,11 +509,8 @@ class FP8BatchedGemmRunner(TunableRunner): return out_tensors - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> List[int]: + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: mat1, mat2, _, _, _ = inputs @@ -735,11 +723,8 @@ class WeightOnlyQuantGemmRunner(TunableRunner): self.weight_only_quant_gemm_runner = WeightOnlyQuantGemmRunner.runner_dict[ instance_key] - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> List[int]: + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: return list(range(self.weight_only_quant_gemm_runner.get_num_configs())) def forward( @@ -813,11 +798,8 @@ class FinegrainedMixedDtypeGemm(TunableRunner): self._finegrained_mixed_dtype_gemm_runner = FinegrainedMixedDtypeGemm._runner_dict[ instance_key] - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> List[int]: + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: return list( range(self._finegrained_mixed_dtype_gemm_runner.get_num_configs())) diff --git a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py index 2bb780f6ef..bbee1b8102 100644 --- a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py @@ -122,11 +122,8 @@ class FP4BlockScaleMoERunner(TunableRunner): self.local_num_experts, self.routed_scaling_factor, self.routing_method_type, self.do_finalize, tactic) - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> List[int]: + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: args = FP4BlockScaleMoEInputs(*inputs) @@ -409,11 +406,8 @@ class FP8BlockScaleMoERunner(TunableRunner): self.local_expert_offset, self.local_num_experts, self.routed_scaling_factor, self.routing_method_type, tactic) - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> List[int]: + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: args = FP8BlockScaleMoEInputs(*inputs) @@ -670,11 +664,8 @@ class MxE4m3MxE2m1BlockScaleMoERunner(TunableRunner): self.local_expert_offset, self.local_num_experts, self.routed_scaling_factor, self.routing_method_type, tactic) - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> List[int]: + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: args = MxE4m3MxE2m1BlockScaleMoEInputs(*inputs) @@ -907,11 +898,8 @@ class E4m3MxE2m1BlockScaleMoERunner(TunableRunner): self.local_expert_offset, self.local_num_experts, self.routed_scaling_factor, self.routing_method_type, tactic) - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> List[int]: + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: args = 
E4m3MxE2m1BlockScaleMoEInputs(*inputs) @@ -1123,11 +1111,8 @@ class Bf16MxE2m1BlockScaleMoERunner(TunableRunner): self.local_num_experts, self.routed_scaling_factor, self.routing_method_type, tactic) - def get_valid_tactics( - self, - inputs: List[torch.Tensor], - profile: OptimizationProfile, - ) -> List[int]: + def get_valid_tactics(self, inputs: List[torch.Tensor], + profile: OptimizationProfile, **kwargs) -> List[int]: args = Bf16MxE2m1BlockScaleMoEInputs(*inputs) diff --git a/tests/unittest/_torch/misc/test_autotuner.py b/tests/unittest/_torch/misc/test_autotuner.py index c2f5c32141..5ed816df8d 100644 --- a/tests/unittest/_torch/misc/test_autotuner.py +++ b/tests/unittest/_torch/misc/test_autotuner.py @@ -151,7 +151,8 @@ def test_autotuner_try_block(): class PartialCrashedRunner(TunableRunner): def get_valid_tactics(self, inputs: List[FakeTensor], - profile: OptimizationProfile) -> List[int]: + profile: OptimizationProfile, + **kwargs) -> List[int]: return [-1, 0, 1] def forward(self, @@ -226,7 +227,7 @@ class GemmRunnerWithAttributes(TunableRunner): self.num_warps = num_warps def get_valid_tactics(self, inputs: List[FakeTensor], - profile: OptimizationProfile) -> List[int]: + profile: OptimizationProfile, **kwargs) -> List[int]: return [-1, 0, 1] def forward(self, @@ -313,11 +314,9 @@ def test_multiple_dynamic_shapes_cache(): class GemmRunnerWithTacticConfigs(TunableRunner): valid_tactic_ids = [-1, 0, 1] - def get_valid_tactics( - self, - inputs: List[FakeTensor], - profile: OptimizationProfile, - ) -> List[Dict[str, int]]: + def get_valid_tactics(self, inputs: List[FakeTensor], + profile: OptimizationProfile, + **kwargs) -> List[Dict[str, int]]: # The simulated delay is not deterministic, so we need to return specific tactics here return [{ "block_size": block_size, From 6f245ec78bfc611ad11032c8e9e36debf0a6b0d3 Mon Sep 17 00:00:00 2001 From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com> Date: Fri, 22 Aug 2025 09:25:15 +0800 Subject: [PATCH 16/33] [None][chore] Mass integration of release/1.0 (#6864) Signed-off-by: Stanley Sun <190317771+StanleySun639@users.noreply.github.com> Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com> Signed-off-by: ruodil <200874449+ruodil@users.noreply.github.com> Signed-off-by: Yiqing Yan Signed-off-by: Yanchao Lu Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Signed-off-by: Bo Deng Signed-off-by: Chang Liu <9713593+chang-l@users.noreply.github.com> Signed-off-by: Stefan Niebler <82932102+stnie@users.noreply.github.com> Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com> Signed-off-by: qqiao Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com> Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com> Signed-off-by: raayandhar Co-authored-by: Stanley Sun <190317771+StanleySun639@users.noreply.github.com> Co-authored-by: ruodil <200874449+ruodil@users.noreply.github.com> Co-authored-by: Yiqing Yan Co-authored-by: Yanchao Lu Co-authored-by: brb-nv <169953907+brb-nv@users.noreply.github.com> Co-authored-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Co-authored-by: Larry <197874197+LarryXFly@users.noreply.github.com> Co-authored-by: Bo Deng Co-authored-by: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Co-authored-by: Stefan Niebler 
<82932102+stnie@users.noreply.github.com> Co-authored-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com> Co-authored-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com> Co-authored-by: Emma Qiao Co-authored-by: Yechan Kim <161688079+yechank-nvidia@users.noreply.github.com> Co-authored-by: 2ez4bz <133824995+2ez4bz@users.noreply.github.com> Co-authored-by: Raayan Dhar <58057652+raayandhar@users.noreply.github.com> Co-authored-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> --- cpp/tensorrt_llm/kernels/topkLastDim.cu | 4 +- examples/llm-api/star_attention.py | 9 +- .../_torch/models/modeling_gemma3vl.py | 7 +- tensorrt_llm/executor/proxy.py | 3 +- tensorrt_llm/executor/worker.py | 4 +- tensorrt_llm/llmapi/llm.py | 11 +- tensorrt_llm/tools/multimodal_builder.py | 14 +- .../accuracy/references/cnn_dailymail.yaml | 3 + .../defs/accuracy/references/gsm8k.yaml | 6 + .../defs/accuracy/references/mmlu.yaml | 3 + .../accuracy/test_disaggregated_serving.py | 136 ++++++++++++------ .../defs/accuracy/test_llm_api_pytorch.py | 21 +++ .../defs/disaggregated/test_disaggregated.py | 4 +- .../defs/llmapi/_run_llmapi_llm.py | 19 ++- .../defs/llmapi/test_llm_api_qa.py | 70 +++++++++ .../defs/perf/pytorch_model_config.py | 16 +-- tests/integration/defs/test_e2e.py | 103 +++++++++++++ .../test_lists/qa/llm_function_full.txt | 12 ++ .../test_lists/qa/llm_function_sanity.txt | 1 + .../test_lists/test-db/l0_dgx_h100.yml | 2 + .../test_lists/test-db/l0_h100.yml | 1 + tests/integration/test_lists/waives.txt | 1 + tests/unittest/llmapi/test_llm_pytorch.py | 38 +++++ .../inflight_batcher_llm/scripts/build.sh | 3 +- 24 files changed, 416 insertions(+), 75 deletions(-) create mode 100644 tests/integration/defs/llmapi/test_llm_api_qa.py diff --git a/cpp/tensorrt_llm/kernels/topkLastDim.cu b/cpp/tensorrt_llm/kernels/topkLastDim.cu index 2e9e0c9179..e6e4e82c92 100644 --- a/cpp/tensorrt_llm/kernels/topkLastDim.cu +++ b/cpp/tensorrt_llm/kernels/topkLastDim.cu @@ -1356,8 +1356,8 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx sort_in = static_cast(aligned_pointers[9]); sort_in_idx = static_cast(aligned_pointers[10]); } - cudaMemsetAsync( - buf, 0, static_cast(aligned_pointers[2]) - static_cast(aligned_pointers[0]), stream); + cudaMemsetAsync(aligned_pointers[0], 0, + static_cast(aligned_pointers[2]) - static_cast(aligned_pointers[0]), stream); } T const* in_buf = nullptr; diff --git a/examples/llm-api/star_attention.py b/examples/llm-api/star_attention.py index 367f7cc843..d87895e71a 100644 --- a/examples/llm-api/star_attention.py +++ b/examples/llm-api/star_attention.py @@ -7,8 +7,8 @@ from difflib import SequenceMatcher import torch from tensorrt_llm import LLM, SamplingParams +from tensorrt_llm.llmapi.llm_args import KvCacheConfig from tensorrt_llm.mapping import CpType -from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig def dump_jsonl(data, fname): @@ -54,11 +54,8 @@ def similarity_score(a, b): return SequenceMatcher(None, a, b).ratio() -# Generate the outputs using either TRT or PyTorch (based on the use_pytorch argument). It’s the same function for both workflows. 
def generate_llm_outputs(args, data, fp8=False, fp8_kv_cache=False): - quant_config = QuantConfig(quant_algo=QuantAlgo.FP8, - kv_cache_quant_algo=QuantAlgo.FP8 if fp8_kv_cache - else None) if fp8 else QuantConfig() + kv_cache_config = KvCacheConfig(dtype="fp8" if fp8_kv_cache else "auto") cp_config = { "cp_type": CpType.STAR, "cp_anchor_size": args.sa_anchor_size, @@ -70,7 +67,7 @@ def generate_llm_outputs(args, data, fp8=False, fp8_kv_cache=False): max_input_len=args.max_input_len, max_seq_len=args.max_seq_len, max_num_tokens=args.max_num_tokens, - quant_config=quant_config, + kv_cache_config=kv_cache_config, tensor_parallel_size=1, context_parallel_size=args.num_procs, cp_config=cp_config, diff --git a/tensorrt_llm/_torch/models/modeling_gemma3vl.py b/tensorrt_llm/_torch/models/modeling_gemma3vl.py index e305b82dba..ce8bcc6c8f 100644 --- a/tensorrt_llm/_torch/models/modeling_gemma3vl.py +++ b/tensorrt_llm/_torch/models/modeling_gemma3vl.py @@ -194,11 +194,16 @@ class Gemma3VLM(PreTrainedModel): "text_config", "vision_config" ], f"Expected subconfig name to be either 'text_config' or 'vision_config'. Got {name} instead." pretrained_config = getattr(model_config.pretrained_config, name) + # ModelOpt currently doesn't quantize the vision part. Without setting quant config to None, + # weight loading fails for vision. + quant_config = model_config.quant_config if name == "text_config" else None + # FlashInfer backend supports custom mask which is needed for bidirectional mask in decoder. preferred_backend = "FLASHINFER" if name == "text_config" else "TRTLLM" sub_model_config: ModelConfig[Gemma3Config] = dataclasses.replace( model_config, pretrained_config=pretrained_config, - attn_backend=preferred_backend) + attn_backend=preferred_backend, + quant_config=quant_config) # Make sure some fields that are not explicitly included in the sub config, but present # in the top-level config, are replicated. 
if (hasattr(sub_model_config.pretrained_config, "torch_dtype") diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 1cb86dfdff..78a0d07620 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -317,7 +317,7 @@ class GenerationExecutorProxy(GenerationExecutor): while True: if self.worker_init_status_queue.poll(1): - ready_signal = self.worker_init_status_queue.get() + ready_signal, error_trace = self.worker_init_status_queue.get() break if any(fut.done() for fut in self.mpi_futures): logger.error("Executor worker died during initialization.") @@ -325,6 +325,7 @@ class GenerationExecutorProxy(GenerationExecutor): self._handle_background_error() if ready_signal != GenerationExecutorProxy.READY_SIGNAL: + logger.error(f"Executor worker initialization error: {error_trace}") self.mpi_session.shutdown_abort(reason=ready_signal) raise RuntimeError( "Executor worker returned error") from ready_signal diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 6d5ec9c1d7..8a1dab6a23 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -774,7 +774,7 @@ def worker_main( logger.error(traceback.format_exc()) print_colored_debug(f"error: {traceback.format_exc()}", "red") if is_leader: - worker_init_status_queue.put(e) + worker_init_status_queue.put((e, traceback.format_exc())) return with worker: @@ -792,7 +792,7 @@ def worker_main( mp_stats_queue) worker._set_iteration_result_queue(worker.kv_events_queues, kv_cache_events_queue) - worker_init_status_queue.put(ready_signal) + worker_init_status_queue.put((ready_signal, None)) while (req := request_queue.get()) is not None: if isinstance(req, CancellingRequest): worker.abort_request(req.id) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 9022f7070c..43edb6b62c 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -124,15 +124,21 @@ class BaseLLM: self._executor_cls = kwargs.pop("executor_cls", GenerationExecutor) self._llm_id = None + log_level = logger.level + logger.set_level("info") # force display the backend + try: backend = kwargs.get('backend', None) - if backend == 'pytorch': + if backend == "pytorch": + logger.info("Using LLM with PyTorch backend") llm_args_cls = TorchLlmArgs elif backend == '_autodeploy': + logger.info("Using LLM with AutoDeploy backend") from .._torch.auto_deploy.llm_args import \ LlmArgs as AutoDeployLlmArgs llm_args_cls = AutoDeployLlmArgs else: + logger.info("Using LLM with TensorRT backend") llm_args_cls = TrtLlmArgs # check the kwargs and raise ValueError directly @@ -162,6 +168,9 @@ class BaseLLM: f"Failed to parse the arguments for the LLM constructor: {e}") raise e + finally: + logger.set_level(log_level) # restore the log level + print_colored_debug(f"LLM.args.mpi_session: {self.args.mpi_session}\n", "yellow") self.mpi_session = self.args.mpi_session diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py index 9a2096852b..de3943c563 100644 --- a/tensorrt_llm/tools/multimodal_builder.py +++ b/tensorrt_llm/tools/multimodal_builder.py @@ -1190,8 +1190,18 @@ def build_mllama_engine(args): model = MllamaForConditionalGeneration.from_pretrained(args.model_path, torch_dtype='auto', device_map='auto') - wrapper = MLLaMAVisionWrapper(model.vision_model, - model.multi_modal_projector) + + # Check if the model structure is updated to transformers >= 4.52.0 + if hasattr(model, 'model') and hasattr(model.model, 'vision_model'): 
+ vision_model = model.model.vision_model + multi_modal_projector = model.model.multi_modal_projector + else: + # transformers < 4.52.0 + vision_model = model.vision_model + multi_modal_projector = model.multi_modal_projector + + wrapper = MLLaMAVisionWrapper(vision_model, multi_modal_projector) + model_dtype = model.dtype image = Image.new('RGB', [2048, 2688]) # dummy image inputs = processor(images=image, diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml index 0c469f2d94..dbf2be50f3 100644 --- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml +++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml @@ -5,6 +5,9 @@ google/gemma-3-1b-it: accuracy: 20.699 google/gemma-3-27b-it: - accuracy: 28.90 + - quant_algo: FP8 + kv_cache_quant_algo: FP8 + accuracy: 27.90 gpt2: - accuracy: 18.408 - quant_algo: W8A16 diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 33c264b9e4..ddf3ab5a86 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -150,8 +150,14 @@ speakleash/Bielik-11B-v2.2-Instruct: accuracy: 40.41 google/gemma-3-1b-it: - accuracy: 25.52 # score getting from lm-eval with HF implementation + - quant_algo: FP8 + kv_cache_quant_algo: FP8 + accuracy: 23.96 google/gemma-3-27b-it: - accuracy: 91.66 + - quant_algo: FP8 + kv_cache_quant_algo: FP8 + accuracy: 90.66 mistralai/Ministral-8B-Instruct-2410: - accuracy: 79.25 - quant_algo: FP8 diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml index 9dd1c25d3c..9786e417b2 100644 --- a/tests/integration/defs/accuracy/references/mmlu.yaml +++ b/tests/integration/defs/accuracy/references/mmlu.yaml @@ -119,6 +119,9 @@ google/gemma-3-1b-it: accuracy: 37.5 google/gemma-3-27b-it: - accuracy: 77.80 + - quant_algo: FP8 + kv_cache_quant_algo: FP8 + accuracy: 76.80 Qwen/Qwen2-0.5B-Instruct: - accuracy: 45.30 - quant_algo: FP8 diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 63509cd698..adcc4be979 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -129,37 +129,81 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any], ctx_total_gpus = ctx_tp * ctx_pp gen_total_gpus = gen_tp * gen_pp - env_ctx = os.environ.copy() - env_ctx["TRTLLM_USE_UCX_KVCACHE"] = "1" - env_ctx["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, range(ctx_total_gpus))) + ctx_urls = disaggregated_server_config["context_servers"]["urls"] + gen_urls = disaggregated_server_config["generation_servers"]["urls"] - env_gen = os.environ.copy() - env_gen["TRTLLM_USE_UCX_KVCACHE"] = "1" - env_gen["CUDA_VISIBLE_DEVICES"] = ",".join( - map(str, range(ctx_total_gpus, ctx_total_gpus + gen_total_gpus))) - ctx_server_args = ctx_args + [ - "--port", "8001", "--extra_llm_api_options", ctx_server_config_path, - f"--tp_size={ctx_tp}", f"--pp_size={ctx_pp}" - ] - gen_server_args = gen_args + [ - "--port", "8002", "--extra_llm_api_options", gen_server_config_path, - f"--tp_size={gen_tp}", f"--pp_size={gen_pp}" - ] - if "max_num_tokens" in ctx_server_config: - ctx_server_args.append( - f"--max_num_tokens={ctx_server_config['max_num_tokens']}") - if "max_num_tokens" in gen_server_config: - 
gen_server_args.append( - f"--max_num_tokens={gen_server_config['max_num_tokens']}") + ctx_ports = [int(url.split(":")[1]) for url in ctx_urls] + gen_ports = [int(url.split(":")[1]) for url in gen_urls] + + ctx_servers = [] + current_gpu_offset = 0 + + for i, port in enumerate(ctx_ports): + env_ctx = os.environ.copy() + env_ctx["TRTLLM_USE_UCX_KVCACHE"] = "1" + gpu_range = range(current_gpu_offset, + current_gpu_offset + ctx_total_gpus) + env_ctx["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_range)) + current_gpu_offset += ctx_total_gpus + + ctx_server_args = ctx_args + [ + "--port", + str(port), "--extra_llm_api_options", ctx_server_config_path, + f"--tp_size={ctx_tp}", f"--pp_size={ctx_pp}" + ] + if "max_num_tokens" in ctx_server_config: + ctx_server_args.append( + f"--max_num_tokens={ctx_server_config['max_num_tokens']}") + + ctx_servers.append((env_ctx, ctx_server_args)) + + gen_servers = [] + + for i, port in enumerate(gen_ports): + env_gen = os.environ.copy() + env_gen["TRTLLM_USE_UCX_KVCACHE"] = "1" + gpu_range = range(current_gpu_offset, + current_gpu_offset + gen_total_gpus) + env_gen["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, gpu_range)) + current_gpu_offset += gen_total_gpus + + gen_server_args = gen_args + [ + "--port", + str(port), "--extra_llm_api_options", gen_server_config_path, + f"--tp_size={gen_tp}", f"--pp_size={gen_pp}" + ] + if "max_num_tokens" in gen_server_config: + gen_server_args.append( + f"--max_num_tokens={gen_server_config['max_num_tokens']}") + + gen_servers.append((env_gen, gen_server_args)) + + @contextlib.contextmanager + def multi_popen(server_configs): + processes = [] + try: + for env, args in server_configs: + proc = popen(args, env=env) + processes.append(proc) + + with contextlib.ExitStack() as stack: + opened_processes = [ + stack.enter_context(proc) for proc in processes + ] + yield opened_processes + except Exception as e: + print( + f"Failed to start disaggregated server processes in multi_popen: {e}" + ) + raise with (MyThreadPoolExecutor(max_workers=16) as - thread_pool, temp_dir, popen(ctx_server_args, env=env_ctx) as - ctx_server, popen(gen_server_args, env=env_gen) as gen_server, + thread_pool, temp_dir, multi_popen(ctx_servers + gen_servers), popen([ trtllm_serve_path, "disaggregated", "-c", disaggregated_serving_config_path, "--server_start_timeout", "3600" - ]) as disaggregated_server): + ])): start_time = time.time() while time.time() - start_time < 3600: time.sleep(1) @@ -225,17 +269,7 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any], return future tokenizer = load_hf_tokenizer(model_name) - - try: - yield DuckLLM(args, tokenizer, generate_async) - finally: - ctx_server.terminate() - gen_server.terminate() - disaggregated_server.terminate() - - ctx_server.wait() - gen_server.wait() - disaggregated_server.wait() + yield DuckLLM(args, tokenizer, generate_async) def run_parallel_test(model_name: str, @@ -244,13 +278,18 @@ def run_parallel_test(model_name: str, ctx_tp: int, gen_pp: int, gen_tp: int, + ctx_instances: int, + gen_instances: int, test_sets: List[LlmapiAccuracyTestHarness], ctx_model: str = None, gen_model: str = None): - if ctx_tp * ctx_pp + gen_tp * gen_pp > get_device_count(): + total_ctx_gpus = ctx_tp * ctx_pp * ctx_instances + total_gen_gpus = gen_tp * gen_pp * gen_instances + if total_ctx_gpus + total_gen_gpus > get_device_count(): pytest.fail( - f"Not enough devices for ctx_pp={ctx_pp}+ctx_tp={ctx_tp} and gen_pp={gen_pp}+gen_tp={gen_tp} test" + f"Not enough devices for {ctx_instances} ctx 
instances (ctx_pp={ctx_pp}*ctx_tp={ctx_tp}) + {gen_instances} gen instances (gen_pp={gen_pp}*gen_tp={gen_tp}), total: {total_ctx_gpus + total_gen_gpus}" ) + kv_cache_config = { "free_gpu_memory_fraction": 0.5, } @@ -272,17 +311,21 @@ def run_parallel_test(model_name: str, "backend": "DEFAULT" } } + + ctx_urls = [f"localhost:{8001 + i * 2}" for i in range(ctx_instances)] + gen_urls = [f"localhost:{8002 + i * 2}" for i in range(gen_instances)] + disaggregated_server_config = { "hostname": "localhost", "port": 8000, "backend": "pytorch", "context_servers": { - "num_instances": 1, - "urls": ["localhost:8001"] + "num_instances": ctx_instances, + "urls": ctx_urls }, "generation_servers": { - "num_instances": 1, - "urls": ["localhost:8002"] + "num_instances": gen_instances, + "urls": gen_urls } } with launch_disaggregated_llm(disaggregated_server_config, @@ -532,8 +575,9 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): if tp * pp * 2 > get_device_count(): pytest.skip(f"Not enough devices for tp={tp}*pp={pp} test") return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, pp, tp, pp, - tp, [get_accuracy_task(testset)]) + tp, 1, 1, [get_accuracy_task(testset)]) + @pytest.mark.skip_less_device(4) @parametrize_with_ids("ctx_pp", [2, 4]) @parametrize_with_ids("gen_tp", [1, 2]) @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"]) @@ -542,7 +586,13 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): pytest.skip( f"Not enough devices for ctx_pp={ctx_pp}*gen_tp={gen_tp} test") return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, ctx_pp, 1, 1, - gen_tp, [get_accuracy_task(testset)]) + gen_tp, 1, 1, [get_accuracy_task(testset)]) + + @pytest.mark.skip_less_device(4) + @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"]) + def test_multi_instance(self, testset): + return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, 1, 1, 1, 1, + 2, 2, [get_accuracy_task(testset)]) @pytest.mark.skip_less_device_memory(140000) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 8879904627..618feaf928 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -842,6 +842,25 @@ class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + def test_fp8_prequantized(self): + # Disabling kv cache reuse as a WAR to deal with gaps in kernel support for Gemma3's non-inclusive sliding window size. + kv_cache_config = KvCacheConfig(enable_block_reuse=False, + enable_partial_reuse=False, + dtype="fp8") + # Note: This has only the LLM part quantized. Vision part is in bfloat16. 
+ prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-27b-it-fp8/" + with LLM(prequantized_model_path, + kv_cache_config=kv_cache_config, + attn_backend="FLASHINFER", + cuda_graph_config=None) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "google/gemma-3-1b-it" @@ -875,6 +894,8 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) task = MMLU(self.MODEL_NAME) task.evaluate(llm) diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py index 0d86204ecb..24000b1f80 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated.py +++ b/tests/integration/defs/disaggregated/test_disaggregated.py @@ -1235,7 +1235,7 @@ def get_config_for_benchmark(model_root, backend): "num_instances": 1, "max_batch_size": 2, "max_num_tokens": 384, - "max_seq_len": 384, + "max_seq_len": 320, "tensor_parallel_size": 1, "pipeline_parallel_size": 1, "disable_overlap_scheduler": True, @@ -1251,7 +1251,7 @@ def get_config_for_benchmark(model_root, backend): "pipeline_parallel_size": 1, "max_batch_size": 2, "max_num_tokens": 384, - "max_seq_len": 384, + "max_seq_len": 320, "cache_transceiver_config": { "backend": backend, "max_tokens_in_buffer": 512, diff --git a/tests/integration/defs/llmapi/_run_llmapi_llm.py b/tests/integration/defs/llmapi/_run_llmapi_llm.py index 854af24efa..14dde17077 100644 --- a/tests/integration/defs/llmapi/_run_llmapi_llm.py +++ b/tests/integration/defs/llmapi/_run_llmapi_llm.py @@ -1,25 +1,32 @@ #!/usr/bin/env python3 import os +from typing import Optional import click -from tensorrt_llm._tensorrt_engine import LLM -from tensorrt_llm.llmapi import BuildConfig, SamplingParams +from tensorrt_llm._tensorrt_engine import LLM as TrtLLM +from tensorrt_llm.llmapi import LLM, BuildConfig, SamplingParams @click.command() @click.option("--model_dir", type=str, required=True) @click.option("--tp_size", type=int, default=1) @click.option("--engine_dir", type=str, default=None) -def main(model_dir: str, tp_size: int, engine_dir: str): +@click.option("--backend", type=str, default=None) +def main(model_dir: str, tp_size: int, engine_dir: str, backend: Optional[str]): build_config = BuildConfig() build_config.max_batch_size = 8 build_config.max_input_len = 256 build_config.max_seq_len = 512 - llm = LLM(model_dir, - tensor_parallel_size=tp_size, - build_config=build_config) + backend = backend or "tensorrt" + assert backend in ["pytorch", "tensorrt"] + + llm_cls = TrtLLM if backend == "tensorrt" else LLM + + kwargs = {} if backend == "pytorch" else {"build_config": build_config} + + llm = llm_cls(model_dir, tensor_parallel_size=tp_size, **kwargs) if engine_dir is not None and os.path.abspath( engine_dir) != os.path.abspath(model_dir): diff --git a/tests/integration/defs/llmapi/test_llm_api_qa.py b/tests/integration/defs/llmapi/test_llm_api_qa.py new file mode 100644 index 0000000000..def4be0895 --- /dev/null +++ b/tests/integration/defs/llmapi/test_llm_api_qa.py @@ -0,0 +1,70 @@ +# Confirm that the default backend is changed +import os + +from defs.common import venv_check_output + +from ..conftest 
import llm_models_root + +model_path = llm_models_root() + "/llama-models-v3/llama-v3-8b-instruct-hf" + + +class TestLlmDefaultBackend: + """ + Check that the default backend is PyTorch for v1.0 breaking change + """ + + def test_llm_args_type_default(self, llm_root, llm_venv): + # Keep the complete example code here + from tensorrt_llm.llmapi import LLM, KvCacheConfig, TorchLlmArgs + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) + llm = LLM(model=model_path, kv_cache_config=kv_cache_config) + + # The default backend should be PyTorch + assert llm.args.backend == "pytorch" + assert isinstance(llm.args, TorchLlmArgs) + + for output in llm.generate(["Hello, world!"]): + print(output) + + def test_llm_args_type_tensorrt(self, llm_root, llm_venv): + # Keep the complete example code here + from tensorrt_llm._tensorrt_engine import LLM + from tensorrt_llm.llmapi import KvCacheConfig, TrtLlmArgs + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) + + llm = LLM(model=model_path, kv_cache_config=kv_cache_config) + + # If the backend is TensorRT, the args should be TrtLlmArgs + assert llm.args.backend in ("tensorrt", None) + assert isinstance(llm.args, TrtLlmArgs) + + for output in llm.generate(["Hello, world!"]): + print(output) + + def test_llm_args_logging(self, llm_root, llm_venv): + # It should print the backend in the log + script_path = os.path.join(os.path.dirname(__file__), + "_run_llmapi_llm.py") + print(f"script_path: {script_path}") + + # Test with pytorch backend + pytorch_cmd = [ + script_path, "--model_dir", model_path, "--backend", "pytorch" + ] + + pytorch_output = venv_check_output(llm_venv, pytorch_cmd) + + # Check that pytorch backend keyword appears in logs + assert "Using LLM with PyTorch backend" in pytorch_output, f"Expected 'pytorch' in logs, got: {pytorch_output}" + + # Test with tensorrt backend + tensorrt_cmd = [ + script_path, "--model_dir", model_path, "--backend", "tensorrt" + ] + + tensorrt_output = venv_check_output(llm_venv, tensorrt_cmd) + + # Check that tensorrt backend keyword appears in logs + assert "Using LLM with TensorRT backend" in tensorrt_output, f"Expected 'tensorrt' in logs, got: {tensorrt_output}" diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index e095f2b85a..2f08739bb2 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -159,14 +159,14 @@ def get_model_yaml_config(model_label: str, 'llama_v4_maverick_17b_128e_instruct_fp8' ], 'config': { - 'use_cuda_graph': - True, - 'cuda_graph_padding_enabled': - True, - 'cuda_graph_batch_sizes': [ - 1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048, - 4096, 8192 - ] + 'cuda_graph_config': { + 'enable_padding': + True, + 'batch_sizes': [ + 1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048, + 4096, 8192 + ] + } } } ] diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index da4faf578b..334aa13416 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2446,6 +2446,109 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality): print("All answers are correct!") +@pytest.mark.skip_less_device(2) +@pytest.mark.skip_less_device_memory(80000) +@pytest.mark.parametrize("model_name,model_path", [ + ("gemma-3-27b-it", "gemma/gemma-3-27b-it"), + ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"), + ("Phi-4-multimodal-instruct", 
"multimodals/Phi-4-multimodal-instruct"), +]) +def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, + model_path): + example_root = Path(os.path.join(llm_root, "examples", "llm-api")) + test_data_root = Path( + os.path.join(llm_models_root(), "multimodals", "test_data")) + + print(f"Accuracy test {model_name} image mode with example inputs.") + + # Define accuracy inputs for image modality + accuracy_inputs = { + "image": { + "prompt": [ + "Describe the object and the weather condition in the image.", + "Describe the traffic condition on the road in the image.", + ], + "media": [ + str(test_data_root / "inpaint.png"), + str(test_data_root / "61.jpg"), + ], + } + } + + # Define expected keywords for each model + expected_keywords = { + "gemma-3-27b-it": { + "image": [ + ["half", "dome", "yosemite", "landmark", "rounded"], + ["flowing", "traffic", "vehicles", "road", "Changi"], + ], + }, + "mistral-small-3.1-24b-instruct": { + "image": [ + ["scenic", "rock", "landscape", "monolith", "formation"], + [ + "multi-lane", "highway", "moderate", "traffic", "flow", + "vehicles", "congestion" + ], + ], + }, + "Phi-4-multimodal-instruct": { + "image": [ + ["image", "depicts", "mountain", "half", "rock"], + ["road", "car", "lane", "traffic", "bus"], + ], + }, + } + + # Build command for image modality + cmd = [ + str(example_root / "quickstart_multimodal.py"), + "--model_dir", + f"{llm_models_root()}/{model_path}", + "--modality", + "image", + "--prompt", + *accuracy_inputs["image"]["prompt"], + "--media", + *accuracy_inputs["image"]["media"], + "--tp_size", + "2", + ] + + # Add model-specific configurations + if model_name == "gemma-3-27b-it": + # Gemma3 VLM needs a custom mask which is only supported by flashinfer backend currently. + # Custom mask involves bidirectional masking of image tokens in context phase. To get this + # correct, chunked prefill and kv cache reuse need to be turned off. + cmd.append("--image_format=pil") + cmd.append("--attention_backend=FLASHINFER") + cmd.append("--disable_kv_cache_reuse") + elif model_name == "Phi-4-multimodal-instruct": + # Set max_seq_len to 4096 to use short rope factor. + cmd.append("--max_seq_len=4096") + cmd.append("--load_lora") + cmd.append("--auto_model_name") + cmd.append("Phi4MMForCausalLM") + + output = llm_venv.run_cmd(cmd, caller=check_output) + + # Set match ratio based on model + match_ratio = 4.0 / 5 + if model_name == "Phi-4-multimodal-instruct": + match_ratio = 0.6 + + # Check output accuracy + for prompt_output, prompt_keywords in zip( + parse_output(output), expected_keywords[model_name]["image"]): + matches = [ + keyword in prompt_output.lower() for keyword in prompt_keywords + ] + obs_match_ratio = 1. 
* sum(matches) / len(matches) + assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}" + + print("All answers are correct!") + + @pytest.mark.parametrize("model_name,model_path", [ ("BertForSequenceClassification", "bert/bert-base-uncased-yelp-polarity"), ]) diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index e28f1bcecd..6af50c17b6 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -464,6 +464,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagl accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle3_tp8[eagle3_one_model=False] accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] @@ -571,6 +572,8 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[GSM8K] +accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[MMLU] accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False] accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True] accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] @@ -645,6 +648,9 @@ test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it- test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[audio] test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image] test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image_audio] +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[gemma-3-27b-it-gemma/gemma-3-27b-it] +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] +test_e2e.py::test_ptp_quickstart_multimodal_2gpu[Phi-4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct] test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] @@ -722,3 +728,9 @@ disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyL # These tests will impact triton. 
They should be at the end of all tests (https://nvbugs/4904271) # examples/test_openai.py::test_llm_openai_triton_1gpu # examples/test_openai.py::test_llm_openai_triton_plugingen_1gpu + +# llm-api promote pytorch to default +llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_logging +llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_type_tensorrt +llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_type_default +llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_logging diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index 51c452cbc7..7f9c03d963 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -47,6 +47,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index a3179e38e9..36fcdce532 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -54,6 +54,8 @@ l0_dgx_h100: - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2] + - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[GSM8K] + - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[MMLU] - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend - test_e2e.py::test_ptp_quickstart_advanced_bs1 diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index bc84082317..0263c452b3 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -218,6 +218,7 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized + - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype - accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype - 
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index fb7a7a50b9..8ade511ce5 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -297,6 +297,7 @@ triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nv triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482) triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-ensemble] SKIP (https://nvbugs/5456485) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4] SKIP (https://nvbugs/5434320) +accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437384) accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend SKIP (https://nvbugs/5448437) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5448437) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5445466) diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 6b78c46bd7..66d946d5c6 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -4,6 +4,7 @@ from contextlib import contextmanager, nullcontext import pytest from tensorrt_llm import LLM +from tensorrt_llm.executor import GenerationExecutorWorker from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.llm_args import PeftCacheConfig from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer @@ -818,3 +819,40 @@ class TestLlmError: match="should not exceed max_num_tokens"): ids = [random.randint(10, 100) for _ in range(101)] llm.generate([ids]) + + +class FailingExecutorWorker(GenerationExecutorWorker): + """Mock worker that fails during initialization to test error handling.""" + + def __init__(self, *args, **kwargs): + # Simulate a constructor failure + raise RuntimeError( + "Mock GenerationExecutorWorker initialization failed") + + +FailingExecutor = type( + "FailingExecutor", (), { + "create": + classmethod( + lambda cls, *args, **kwargs: FailingExecutorWorker(*args, **kwargs)) + }) + + +def test_llm_with_proxy_error(): + """Test that LLM properly handles GenerationExecutorWorker constructor failures. + + This test mocks the GenerationExecutorWorker to fail during __init__ and + verifies that the LLM class properly catches and re-raises the error. 
+ """ + from unittest.mock import patch + + # Test that the error is properly caught and re-raised by LLM + # We patch GenerationExecutor.create directly to return our failing worker + with patch('tensorrt_llm.executor.executor.GenerationExecutor.create', + side_effect=lambda *args, **kwargs: FailingExecutorWorker( + *args, **kwargs)): + with pytest.raises( + RuntimeError, + match="Mock GenerationExecutorWorker initialization failed"): + llm = LLM(model=llama_model_path, + kv_cache_config=global_kvcache_config) diff --git a/triton_backend/inflight_batcher_llm/scripts/build.sh b/triton_backend/inflight_batcher_llm/scripts/build.sh index 44a5550021..031d623d69 100644 --- a/triton_backend/inflight_batcher_llm/scripts/build.sh +++ b/triton_backend/inflight_batcher_llm/scripts/build.sh @@ -53,7 +53,8 @@ fi # TODO: Remove specifying Triton version after cmake version is upgraded to 3.31.8 # Get TRITON_SHORT_TAG from docker/Dockerfile.multi -LLM_ROOT="$(dirname $0)/../../../.." +LLM_ROOT=$BUILD_DIR/../../.. +LLM_ROOT=$(cd -- "$LLM_ROOT" && pwd) TRITON_SHORT_TAG=$("$LLM_ROOT/jenkins/scripts/get_triton_tag.sh" "$LLM_ROOT") cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ${BUILD_TESTS_ARG} -DTRITON_COMMON_REPO_TAG=${TRITON_SHORT_TAG} -DTRITON_CORE_REPO_TAG=${TRITON_SHORT_TAG} -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_SHORT_TAG} -DTRITON_BACKEND_REPO_TAG=${TRITON_SHORT_TAG} .. make install From c5036cb53661ade3d2d3faa7dded317d5dcb8e42 Mon Sep 17 00:00:00 2001 From: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com> Date: Thu, 21 Aug 2025 18:41:44 -0700 Subject: [PATCH 17/33] [None][docs] update stale link for AutoDeploy (#7135) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1559ee4d00..def2a7cb5f 100644 --- a/README.md +++ b/README.md @@ -253,5 +253,5 @@ Deprecation is used to inform developers that some APIs and tools are no longer ## Useful Links - [Quantized models on Hugging Face](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4): A growing collection of quantized (e.g., FP8, FP4) and optimized LLMs, including [DeepSeek FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), ready for fast inference with TensorRT-LLM. - [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo): A datacenter scale distributed inference serving framework that works seamlessly with TensorRT-LLM. -- [AutoDeploy](./examples/auto_deploy/README.md): A prototype backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models. +- [AutoDeploy](https://nvidia.github.io/TensorRT-LLM/torch/auto_deploy/auto-deploy.html): A prototype backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models. - [WeChat Discussion Group](https://github.com/NVIDIA/TensorRT-LLM/issues/5359): A real-time channel for TensorRT-LLM Q&A and news. 
From 07c711eb1f759cbf3da02ef7e93bd050fdbf1c83 Mon Sep 17 00:00:00 2001 From: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> Date: Fri, 22 Aug 2025 10:00:04 +0800 Subject: [PATCH 18/33] [TRTLLM-6825][fix] Update lora for phi4-mm (#6817) Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> --- tensorrt_llm/_torch/models/modeling_phi4mm.py | 10 ++++----- tensorrt_llm/_torch/pyexecutor/_util.py | 3 ++- .../_torch/pyexecutor/model_engine.py | 9 +++++--- .../_torch/pyexecutor/resource_manager.py | 3 ++- tensorrt_llm/lora_helper.py | 1 + tensorrt_llm/lora_manager.py | 22 ++++++++++--------- .../defs/perf/pytorch_model_config.py | 6 +++-- tests/integration/defs/test_e2e.py | 6 ++--- 8 files changed, 34 insertions(+), 26 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py index ee0263eb5e..bc449e1da5 100644 --- a/tensorrt_llm/_torch/models/modeling_phi4mm.py +++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py @@ -611,23 +611,21 @@ class Phi4MMForCausalLM(transformers.PreTrainedModel): @staticmethod def lora_config(model_dir: str): _lora_config = LoraConfig( - lora_dir=[ - f"{model_dir}/vision-lora", - f"{model_dir}/speech-lora", - ], lora_target_modules=[ "attn_qkv", "attn_dense", - "mlp_h_to_4h", + "mlp_gate_up", "mlp_4h_to_h", ], trtllm_modules_to_hf_modules={ "attn_qkv": "qkv_proj", "attn_dense": "o_proj", - "mlp_h_to_4h": "gate_up_proj", + "mlp_gate_up": "gate_up_proj", "mlp_4h_to_h": "down_proj", }, max_lora_rank=320, # Max rank for Phi4MM. + swap_gate_up_proj_lora_b_weight= + False, # Disable swap gate_up_proj.lora_B.weight for Phi4MM. ) return _lora_config diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py index 2f0753ed31..20e3aaaa09 100644 --- a/tensorrt_llm/_torch/pyexecutor/_util.py +++ b/tensorrt_llm/_torch/pyexecutor/_util.py @@ -514,7 +514,8 @@ def create_py_executor_instance( resources[ResourceManagerType.PEFT_CACHE_MANAGER] = peft_cache_manager model_engine.set_lora_model_config( lora_config.lora_target_modules, - lora_config.trtllm_modules_to_hf_modules) + lora_config.trtllm_modules_to_hf_modules, + lora_config.swap_gate_up_proj_lora_b_weight) max_num_sequences = executor_config.max_batch_size * mapping.pp_size diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index d9f180c0fc..1b3fbfbfc4 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -468,13 +468,16 @@ class PyTorchModelEngine(ModelEngine): def runtime_draft_len(self): return self.max_draft_len if self.enable_spec_decode else 0 - def set_lora_model_config(self, lora_target_modules: list[str], - trtllm_modules_to_hf_modules: dict[str, str]): + def set_lora_model_config(self, + lora_target_modules: list[str], + trtllm_modules_to_hf_modules: dict[str, str], + swap_gate_up_proj_lora_b_weight: bool = True): self.lora_model_config = LoraModelConfig( lora_target_modules=lora_target_modules, trtllm_modules_to_hf_modules=trtllm_modules_to_hf_modules, hidden_size=self.model.config.hidden_size, - dtype=torch_dtype_to_str(self.model.config.torch_dtype)) + dtype=torch_dtype_to_str(self.model.config.torch_dtype), + swap_gate_up_proj_lora_b_weight=swap_gate_up_proj_lora_b_weight) @property def use_mrope(self): diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py index 9a5b42166d..4066b45cf8 100644 --- 
a/tensorrt_llm/_torch/pyexecutor/resource_manager.py +++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py @@ -1040,7 +1040,8 @@ class PeftCacheManager(BaseResourceManager): self._lora_model_config = LoraModelConfig( lora_config.lora_target_modules, lora_config.trtllm_modules_to_hf_modules, model_config.hidden_size, - binding_to_str_dtype(model_config.data_type)) + binding_to_str_dtype(model_config.data_type), + lora_config.swap_gate_up_proj_lora_b_weight) self._lora_manager = LoraManager() def add_request_peft(self, request: LlmRequest): diff --git a/tensorrt_llm/lora_helper.py b/tensorrt_llm/lora_helper.py index 37f5d534f7..719df51079 100644 --- a/tensorrt_llm/lora_helper.py +++ b/tensorrt_llm/lora_helper.py @@ -88,6 +88,7 @@ class LoraConfig(DictConversion): trtllm_modules_to_hf_modules: Dict[str, str] = field(default_factory=dict) max_loras: Optional[int] = None max_cpu_loras: Optional[int] = None + swap_gate_up_proj_lora_b_weight: bool = True def __post_init__(self): assert self.lora_ckpt_source in [ diff --git a/tensorrt_llm/lora_manager.py b/tensorrt_llm/lora_manager.py index 7440715474..f1ca920415 100644 --- a/tensorrt_llm/lora_manager.py +++ b/tensorrt_llm/lora_manager.py @@ -243,6 +243,7 @@ class LoraModelConfig: trtllm_modules_to_hf_modules: dict[str, str] hidden_size: int dtype: str + swap_gate_up_proj_lora_b_weight: bool = True class HfLoraLoader: @@ -968,16 +969,17 @@ class LoraManager(object): ) hf_modules = set(hf_modules_to_trtllm_modules.keys()) - def preprocess_lora_weights(lora_model): + def preprocess_lora_weights(lora_model, model_config): # Swap weights of gate_up_proj - for key, value in lora_model.items(): - if "gate_up_proj.lora_B.weight" in key: - original_weights = value.contiguous().clone() - half_split = original_weights.shape[0] // 2 - first_half = original_weights[:half_split, :] - second_half = original_weights[half_split:, :] - value = torch.cat((second_half, first_half), dim=0) - lora_model[key] = value + if getattr(model_config, "swap_gate_up_proj_lora_b_weight", True): + for key, value in lora_model.items(): + if "gate_up_proj.lora_B.weight" in key: + original_weights = value.contiguous().clone() + half_split = original_weights.shape[0] // 2 + first_half = original_weights[:half_split, :] + second_half = original_weights[half_split:, :] + value = torch.cat((second_half, first_half), dim=0) + lora_model[key] = value return lora_model def load_from_model_dir(uid, model_dir, hf_config): @@ -989,7 +991,7 @@ class LoraManager(object): lora_model = load_state_dict(get_model_path(model_dir, "adapter_model")) if lora_model is None: raise ValueError(f"Failed to load adapter_model from {model_dir}") - lora_model = preprocess_lora_weights(lora_model) + lora_model = preprocess_lora_weights(lora_model, model_config) all_weights = get_all_hf_lora_weights(lora_model, hf_modules, component) rank = int(hf_config["r"]) rs_lora = bool(hf_config.get("use_rslora", False)) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index 2f08739bb2..15354b36ea 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -191,15 +191,17 @@ def get_model_yaml_config(model_label: str, } if 'phi_4_multimodal_instruct' in model_label: lora_config['lora_config']['lora_target_modules'] = [ - "attn_qkv", "attn_dense", "mlp_h_to_4h", "mlp_4h_to_h" + "attn_qkv", "attn_dense", "mlp_gate_up", "mlp_4h_to_h" ] 
lora_config['lora_config']['trtllm_modules_to_hf_modules'] = { "attn_qkv": "qkv_proj", "attn_dense": "o_proj", - "mlp_h_to_4h": "gate_up_proj", + "mlp_gate_up": "gate_up_proj", "mlp_4h_to_h": "down_proj" } lora_config['lora_config']['max_lora_rank'] = 320 + lora_config['lora_config'][ + 'swap_gate_up_proj_lora_b_weight'] = False base_config.update(lora_config) kv_cache_config = base_config.get('kv_cache_config', KvCacheConfig()) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 334aa13416..bb38e94aac 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2404,15 +2404,15 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality): } expected_keywords = { "image": [ - ["image", "depicts", "mountain", "half", "rock"], - ["road", "car", "lane", "traffic", "bus"], + ["object", "mountain", "weather", "clear", "clouds"], + ["traffic", "road", "vehicles", "cars", "bus"], ], "audio": [ ["what", "is", "the", "traffic", "sign", "in", "image"], ["what", "is", "shown", "in", "this", "image"], ], "image_audio": [ - ["image", "depicts", "Grand", "rock", "scene"], + ["image", "depicts", "scenic", "famous", "landmark"], ], } From 4017f7cd6be52cfafd2eda6d0b5d8995de600f5d Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Fri, 22 Aug 2025 10:39:25 +0800 Subject: [PATCH 19/33] [None][chore] Add failed cases into waives.txt (#7109) Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com> --- tests/integration/test_lists/waives.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 8ade511ce5..9d1a19250c 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -321,3 +321,6 @@ full:H100/accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_eagle full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True] SKIP (https://nvbugs/5467815) full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_prefill[tp4ep4-cuda_graph=True] SKIP (https://nvbugs/5467815) accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5470769) +full:L40S/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] SKIP (https://nvbugs/5347051) +full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] SKIP (https://nvbugs/5471106) +full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2] SKIP (https://nvbugs/5471108) From 983dd7e57c300b6647fb1e7e57653840d0bd2fa8 Mon Sep 17 00:00:00 2001 From: Yukun He <23156053+hyukn@users.noreply.github.com> Date: Fri, 22 Aug 2025 12:28:30 +0800 Subject: [PATCH 20/33] [None][fix] Fix mm_placholder_counts extraction issue. 
(#7118) Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com> --- tensorrt_llm/inputs/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index a5e0c0a5a4..458b0a11d8 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -433,7 +433,7 @@ def apply_chat_template( if model_type in PLACEHOLDER_EXCEPTIONS: # flattened content do not work for these models, so go back to other formats as needed conversation = handle_placeholder_exceptions(model_type, conversation, - mm_placeholder_counts) + [mm_placeholder_counts]) return tokenizer.apply_chat_template( conversation=conversation, From 099f081e03ef8c92cadeb148915edad368b6daca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20C=C3=A1mpora?= <961215+dcampora@users.noreply.github.com> Date: Fri, 22 Aug 2025 08:09:30 +0200 Subject: [PATCH 21/33] [TRTLLM-7155][feat] Unify sampler handle logits implementation. (#6867) Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com> --- .../pyexecutor/executor_request_queue.py | 11 +- .../_torch/pyexecutor/handle_logits.py | 23 ++- tensorrt_llm/_torch/pyexecutor/py_executor.py | 98 ++++++------ tensorrt_llm/_torch/pyexecutor/sampler.py | 149 ++++++++---------- .../_torch/speculative/model_drafter.py | 17 +- tensorrt_llm/_torch/speculative/mtp.py | 6 +- .../defs/accuracy/test_llm_api_pytorch.py | 35 ++++ .../_torch/sampler/test_return_logits.py | 6 - 8 files changed, 201 insertions(+), 144 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py index 96c5957ef9..8cfccb020a 100644 --- a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py +++ b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py @@ -16,7 +16,6 @@ from tensorrt_llm.mapping import CpType from ..distributed import Distributed from .llm_request import (ExecutorRequest, LlmRequest, executor_request_to_llm_request) -from .sampler import Sampler, TorchSampler SHUTDOWN_REQUEST_ID = -1 @@ -707,21 +706,19 @@ class ExecutorRequestQueue: def set_exclude_last_generation_logits(self, disable_overlap_scheduler: bool, - sampler: Sampler) -> None: + pp_size: int) -> None: # When overlap scheduler is enabled then when starting to handle a new prompt, # sample_async is called twice before the first call to update_requests: # - 1st time as a context request that handles on the 1st generated token # - 2nd time as a generation request that handles on the 2nd generated token. # and only after these two calls the sampler's update_request method is called. # So in a sampler that works by the expected flow of handling the logits in - # sample_async (TorchSampler is an anomaly that instead does that on - # update_requests), every update_request doesn't handle the newest token, but one + # sample_async, every update_request doesn't handle the newest token, but one # before it. Since all these calls work on the same request object, then its # logits storage contains the logits of both the token update_requests should work # on, and also its next token. Thus, excluding the last generation logits from any - # getter is required, when not using TorchSampler. - self.should_exclude_last_generation_logits = not disable_overlap_scheduler and not isinstance( - sampler, TorchSampler) + # getter is required. 
+ self.should_exclude_last_generation_logits = not disable_overlap_scheduler and pp_size == 1 def _should_exclude_last_generation_logits(self) -> bool: return self.should_exclude_last_generation_logits diff --git a/tensorrt_llm/_torch/pyexecutor/handle_logits.py b/tensorrt_llm/_torch/pyexecutor/handle_logits.py index 81986df593..b3d7ced6a5 100644 --- a/tensorrt_llm/_torch/pyexecutor/handle_logits.py +++ b/tensorrt_llm/_torch/pyexecutor/handle_logits.py @@ -1,3 +1,4 @@ +from itertools import chain from typing import List import torch @@ -16,9 +17,9 @@ class HandleLogits: context_requests: List[LlmRequest], generation_requests: List[LlmRequest], logits: torch.Tensor, - num_context_logits_prefix_sum: List[int], - max_num_sequences: int, beam_width: int, + num_context_logits_prefix_sum: list[int], + is_generation_model: bool, ): """Handles context and generation logits for a batch of requests. @@ -26,10 +27,24 @@ class HandleLogits: context_requests: List of context requests to process generation_requests: List of generation requests to process logits: Input logits tensor - num_context_logits_prefix_sum: Prefix sum of context logits for each request - max_num_sequences: Maximum number of sequences to process beam_width: Beam width for the generation requests + num_context_logits_prefix_sum: Prefix sum of the logits + is_generation_model: Bool containing whether the model is generation or not """ + if not any(r.py_return_context_logits or r.py_return_generation_logits + for r in chain(context_requests, generation_requests)): + return + + if not is_generation_model: + for llm_req, logits_temp in zip(context_requests, logits): + if logits_temp.ndim == 1: + # For BERT: Add axis to be compatible with LogitsStorage + # (LogitsStorage will interpret this dim as the prompt_len which + # is not relevant for outputting logits of encoder only model). 
+ logits_temp = logits_temp.unsqueeze(0) + llm_req.py_result.append_context_logits(logits_temp) + return + # Copy logits into decoderBuffers.logits for batch_index, llm_req in enumerate(context_requests): logits_begin = num_context_logits_prefix_sum[batch_index] diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index a40b9b9045..453434d9d6 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -39,6 +39,7 @@ from ..models.modeling_utils import DecoderModelForCausalLM from ..speculative.drafter import Drafter from .executor_request_queue import ExecutorRequestQueue, RequestQueueItem from .guided_decoder import GuidedDecoder +from .handle_logits import HandleLogits from .kv_cache_transceiver import KvCacheTransceiver from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState, LlmResponse) @@ -244,7 +245,7 @@ class PyExecutor: is_disaggregated=kv_cache_transceiver is not None, ) self.executor_request_queue.set_exclude_last_generation_logits( - self.disable_overlap_scheduler, self.sampler) + self.disable_overlap_scheduler, self.dist.pp_size) self.stats_lock = threading.Lock() self.stats = [] @@ -681,24 +682,6 @@ class PyExecutor: self.response_cv.notify_all() self.shutdown_event.set() - def _need_return_logits(self, scheduled_requests: ScheduledRequests): - for req in scheduled_requests.context_requests: - if req.py_return_context_logits: - return True - for req in scheduled_requests.generation_requests: - if req.py_return_generation_logits: - return True - return False - - def _need_return_log_probs(self, scheduled_requests: ScheduledRequests): - for req in scheduled_requests.context_requests: - if req.py_return_log_probs: - return True - for req in scheduled_requests.generation_requests: - if req.py_return_log_probs: - return True - return False - def _executor_loop_pp(self): logger.debug(f"Starting executor loop for pp_rank {self.dist.pp_rank}") torch.cuda.set_device(self.device_id) @@ -790,10 +773,6 @@ class PyExecutor: else: with torch.cuda.nvtx.range("_forward_step_last_pp"): batch_outputs = self._forward_step(scheduled_batch) - logits_host = None - if self._need_return_logits(scheduled_batch): - logits_host = batch_outputs["logits"].to( - "cpu", non_blocking=True) if self.kv_cache_transceiver and self.guided_decoder: self.guided_decoder.init_disagg_gen_requests( scheduled_batch) @@ -802,7 +781,6 @@ class PyExecutor: sample_state = self._sample_async( scheduled_batch, batch_outputs) - sample_state.host.logits = logits_host self._update_request_states(scheduled_batch) if self.enable_iter_perf_stats: @@ -832,18 +810,10 @@ class PyExecutor: torch.cuda.nvtx.range_push( "_handle_new_tokens_inter_pp") # Receive tokens from previous pp rank (w.r.t model forward direction) - ( - logits, - sample_state.host, - ) = self.dist.recv_object( + sample_state.host = self.dist.recv_object( src=self.dist.prev_pp_rank, tag=prev_microbatch_id, ) - if logits is not None: - logits_host = torch.from_numpy(logits) - sample_state.host.logits = logits_host - sample_state.device.logits = logits_host.to( - self.device_id) else: torch.cuda.nvtx.range_push("_handle_new_tokens_last_pp") sample_state.sampler_event.synchronize() @@ -853,18 +823,9 @@ class PyExecutor: if not self.dist.is_second_last_pp_rank: if self.send_handles[prev_microbatch_id] is not None: self.send_handles[prev_microbatch_id].wait() - needs_logits = ( - self._need_return_logits(scheduled_batch) - or 
(self._need_return_log_probs(scheduled_batch) - and sample_state.host.log_probs is not None)) - serialized_logits = sample_state.host.logits.numpy( - ) if needs_logits else None self.send_handles[ prev_microbatch_id] = self.dist.isend_object( - ( - serialized_logits, - sample_state.host, - ), + sample_state.host, dest=self.dist.next_pp_rank, tag=prev_microbatch_id) torch.cuda.nvtx.range_pop() @@ -884,6 +845,40 @@ class PyExecutor: previous_batch.scheduled_ctx_reqs) self._handle_canceled_requests() + + # If logits were requested last PP rank has to send to first PP rank (who sends responses) the + # logits of the requests that have finished. + # NOTE: If the rank processing the logits ever becomes the same as + # the rank sending the responses, this code can be removed. + finished_reqs = [ + r for r in previous_batch.sample_state. + scheduled_requests.all_requests() + if r.state == LlmRequestState.GENERATION_COMPLETE + and (r.py_return_context_logits + or r.py_return_generation_logits) + ] + if self.dist.is_first_pp_rank and len(finished_reqs): + finished_reqs_py_results = [ + r.py_result for r in finished_reqs + ] + finished_reqs_py_results = self.dist.recv_object( + src=self.dist.prev_pp_rank, + tag=prev_microbatch_id, + ) + for req, py_result in zip(finished_reqs, + finished_reqs_py_results): + req.py_result = py_result + + elif self.dist.is_last_pp_rank and len(finished_reqs): + if self.send_handles[ + prev_microbatch_id] is not None: + self.send_handles[prev_microbatch_id].wait() + self.send_handles[ + prev_microbatch_id] = self.dist.isend_object( + [r.py_result for r in finished_reqs], + dest=self.dist.next_pp_rank, + tag=prev_microbatch_id) + finished_requests = self._handle_responses() previous_scheduled_batch = previous_batch.sample_state.scheduled_requests self.resource_manager.update_resources( @@ -1538,7 +1533,22 @@ class PyExecutor: batch_outputs) -> SampleState | None: try: if batch_outputs is not None: - return self.sampler.sample_async(scheduled_batch, batch_outputs) + num_context_logits_prefix_sum = [0] + prefix_sum = 0 + for request in scheduled_batch.context_requests: + prefix_sum += request.context_chunk_size if request.py_return_context_logits else 1 + num_context_logits_prefix_sum.append(prefix_sum) + + HandleLogits()(scheduled_batch.context_requests, + scheduled_batch.generation_requests, + batch_outputs["logits"], + self.sampler.beam_width( + scheduled_batch.all_requests()), + num_context_logits_prefix_sum, + self.sampler.is_generation_model()) + + return self.sampler.sample_async(scheduled_batch, batch_outputs, + num_context_logits_prefix_sum) except Exception as e: traceback.print_exc() error_msg = str(e) diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py index 919b99be2d..e6d19a9df4 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampler.py +++ b/tensorrt_llm/_torch/pyexecutor/sampler.py @@ -5,7 +5,6 @@ from typing import List, Literal, Optional import torch -from tensorrt_llm._torch.pyexecutor.handle_logits import HandleLogits from tensorrt_llm._torch.pyexecutor.make_decoding_batch_input_output import \ MakeDecodingBatchInputOutput from tensorrt_llm._utils import nvtx_range, torch_dtype_to_binding @@ -30,7 +29,6 @@ from .scheduler import ScheduledRequests @dataclass(kw_only=True) class SampleStateTensors: new_tokens: torch.Tensor - logits: torch.Tensor | None = None log_probs: torch.Tensor | None = None def values(self): @@ -58,14 +56,24 @@ class Sampler(ABC): return None @abstractmethod - def sample_async(self, 
scheduled_requests: ScheduledRequests, - model_outputs) -> SampleState: + def sample_async(self, scheduled_requests: ScheduledRequests, model_outputs, + num_context_logits_prefix_sum: list[int]) -> SampleState: raise NotImplementedError @abstractmethod def update_requests(self, state: SampleState) -> None: raise NotImplementedError + @staticmethod + def beam_width(scheduled_requests: Iterable[LlmRequest]) -> int: + for req in scheduled_requests: + return req.sampling_config.beam_width + return 0 + + @abstractmethod + def is_generation_model(self) -> bool: + raise NotImplementedError + class EarlyStopSampler(Sampler): """ @@ -73,10 +81,9 @@ class EarlyStopSampler(Sampler): such as encoder-only model (e.g., BERT) or reward models that only need context phase. """ - def sample_async(self, scheduled_requests: ScheduledRequests, - model_outputs) -> SampleState: - host = SampleStateTensors(logits=model_outputs['logits'], - new_tokens=torch.empty(0)) + def sample_async(self, scheduled_requests: ScheduledRequests, model_outputs, + num_context_logits_prefix_sum: list[int]) -> SampleState: + host = SampleStateTensors(new_tokens=torch.empty(0)) return SampleState(scheduled_requests=scheduled_requests, host=host) def update_requests(self, state: SampleState) -> None: @@ -87,14 +94,9 @@ class EarlyStopSampler(Sampler): request.state = LlmRequestState.GENERATION_COMPLETE # NOTE: This is a hack: set finish reason manually and set the beam 0 request.set_finished_reason(FinishReason.LENGTH, 0) - if request.py_return_context_logits: - logits = state.host.logits[idx] - if logits.ndim == 1: - # For BERT: Add axis to be compatible with LogitsStorage - # (LogitsStorage will interpret this dim as the prompt_len which - # is not relevant for outputting logits of encoder only model). 
- logits = logits.unsqueeze(0) - request.py_result.append_context_logits(logits) + + def is_generation_model(self) -> bool: + return False @dataclass(kw_only=True) @@ -117,8 +119,10 @@ class EarlyStopWithMMResult(Sampler): Use for skipping decoding step for non generation model, and return the batch_output (such as mm_embeddings) """ - def sample_async(self, scheduled_requests: ScheduledRequests, - model_outputs) -> SampleStateWithMMResult: + def sample_async( + self, scheduled_requests: ScheduledRequests, model_outputs, + num_context_logits_prefix_sum: list[int] + ) -> SampleStateWithMMResult: # from model_outputs to MultimodalResult data = MultimodalResult(mm_embeddings=model_outputs['mm_embeddings']) return SampleStateWithMMResult(scheduled_requests=scheduled_requests, @@ -141,6 +145,9 @@ class EarlyStopWithMMResult(Sampler): request.py_result.append_mm_embeddings(mm_embedding) + def is_generation_model(self) -> bool: + return False + def top_k_sampling_batch(logits, top_k=50, @@ -352,6 +359,9 @@ class TorchSampler(Sampler): BEAM = 0 MAX_BEAM_WIDTH = BEAM + 1 + def is_generation_model(self) -> bool: + return True + @dataclass(frozen=True, kw_only=True) class Store: new_tokens: torch.Tensor @@ -445,13 +455,9 @@ class TorchSampler(Sampler): return False - def handle_logits(self, request: LlmRequest, state: SampleState, *, - beam: int, count: int): + def handle_logprobs(self, request: LlmRequest, state: SampleState, *, + beam: int, count: int): current_slice = slice(0, count), request.py_seq_slot, beam - if request.py_return_generation_logits: - assert state.host.logits is not None - current_logits = state.host.logits[current_slice] - request.py_result.append_generation_logits(current_logits) if request.py_return_log_probs: assert state.host.log_probs is not None log_probs = state.host.log_probs[request.py_seq_slot][beam][:count] @@ -546,7 +552,7 @@ class TorchSampler(Sampler): continue new_token = add_token(req, new_tokens, beam=self.BEAM) self._handle_stop_criteria(req, new_token) - self.handle_logits(req, state, beam=self.BEAM, count=1) + self.handle_logprobs(req, state, beam=self.BEAM, count=1) req.py_decoding_iter += 1 for req in state.scheduled_requests.generation_requests: @@ -558,37 +564,28 @@ class TorchSampler(Sampler): req.py_num_accepted_draft_tokens = num_accepted req.py_rewind_len = req.py_draft_pages_allocated - num_accepted processed += num_accepted - self.handle_logits(req, state, beam=self.BEAM, count=processed) + self.handle_logprobs(req, state, beam=self.BEAM, count=processed) req.py_decoding_iter += 1 - def log_probs_host(self, requests: Iterable[LlmRequest]): + def log_probs_host(self, scheduled_requests: ScheduledRequests): """Shape: In lockstep with TRTLLMSampler: https://github.com/NVIDIA/TensorRT-LLM/blob/cea5dd1e3883b18bf50901a7f196f50a9544c28c/cpp/include/tensorrt_llm/runtime/decoderState.h#L103""" - if any(req.py_return_log_probs for req in requests): + if any(req.py_return_log_probs + for req in scheduled_requests.all_requests()): return torch.empty( (self.max_num_sequences, self.MAX_BEAM_WIDTH, self.max_tokens), device="cpu", pin_memory=True) return None - def gen_logits_host(self, requests: Iterable[LlmRequest], vocab_size: int): - if any(req.py_return_generation_logits for req in requests): - return torch.empty((self.max_tokens, self.max_num_sequences, - self.MAX_BEAM_WIDTH, vocab_size), - device="cpu", - pin_memory=True) - return None - def sample_async(self, scheduled_requests: ScheduledRequests, - model_outputs: dict[str, torch.Tensor]) -> SampleState: - 
requests = scheduled_requests.all_requests() + model_outputs: dict[str, torch.Tensor], + num_context_logits_prefix_sum: list[int]) -> SampleState: new_tokens = self.store.new_tokens - vocab_size = model_outputs["logits"].shape[-1] - log_probs_host = self.log_probs_host(requests) - gen_logits_host = self.gen_logits_host(requests, vocab_size) - self._process_requests(requests, + log_probs_host = self.log_probs_host(scheduled_requests) + self._process_requests(scheduled_requests, model_outputs, new_tokens, - gen_logits_host=gen_logits_host, + num_context_logits_prefix_sum, log_probs_host=log_probs_host) new_tokens_host = new_tokens.to(device="cpu", non_blocking=True) sampler_event = torch.cuda.Event() @@ -596,8 +593,7 @@ class TorchSampler(Sampler): return SampleState(scheduled_requests=scheduled_requests, device=SampleStateTensors(new_tokens=new_tokens), host=SampleStateTensors(new_tokens=new_tokens_host, - log_probs=log_probs_host, - logits=gen_logits_host), + log_probs=log_probs_host), sampler_event=sampler_event) @staticmethod @@ -659,19 +655,37 @@ class TorchSampler(Sampler): return logits def _process_requests(self, - requests: list[LlmRequest], + scheduled_requests: ScheduledRequests, model_outputs: dict[str, torch.Tensor], new_tokens: torch.Tensor, + num_context_logits_prefix_sum: list[int], *, - gen_logits_host: torch.Tensor | None = None, log_probs_host: torch.Tensor | None = None): beam_width = self.MAX_BEAM_WIDTH beam = self.BEAM - raw_logits = model_outputs["logits"] + + # raw_logits should contain only the logits from the gen requests. + # If return context logits is requested, fetch only the logits from gen requests. + if any(r.py_return_context_logits + for r in scheduled_requests.context_requests): + gen_logits_indices = [] + total_context_logits = num_context_logits_prefix_sum[-1] + for i in range(len(scheduled_requests.context_requests)): + gen_logits_indices.append(num_context_logits_prefix_sum[i + 1] - + 1) + gen_logits_indices.extend( + range( + total_context_logits, total_context_logits + + len(scheduled_requests.generation_requests))) + raw_logits = model_outputs["logits"][gen_logits_indices] + else: + raw_logits = model_outputs["logits"] + + requests = scheduled_requests.all_requests() num_steps = [1 + get_draft_token_length(req) for req in requests] sum_steps = sum(num_steps) no_draft_tokens = len(requests) == sum_steps - fast_path = not self.enable_mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None + fast_path = not self.enable_mixed_sampler and no_draft_tokens and log_probs_host is None seq_slots_host = torch.as_tensor([r.py_seq_slot for r in requests]) seq_slots = seq_slots_host.to(device="cuda", non_blocking=True) @@ -727,8 +741,6 @@ class TorchSampler(Sampler): new_tokens[current_slice] = next_tokens if request.py_draft_logits is not None: request.py_target_probs = softmax.clone() - if gen_logits_host is not None: - gen_logits_host[current_slice].copy_(logits, non_blocking=True) if log_probs_host is not None: assert beam == 0, "The following call relies on beam_width to be 1 - hence the unsqueeze" token_probs = torch.gather( @@ -769,6 +781,9 @@ class TRTLLMSampler(Sampler): MAX_DECODING_TOKENS = 1 # It must be 1 when not in speculative decoding SampleState = SampleStateTRTLLM + def is_generation_model(self) -> bool: + return True + def __init__( self, executor_config: ExecutorConfig, @@ -864,7 +879,6 @@ class TRTLLMSampler(Sampler): speculative_decoding_fast_logits=False, is_leader_in_orch_mode=False, 
is_normalize_log_probs=False) - self.algs.handle_logits = HandleLogits() self.algs.make_decoding_batch_input_output = MakeDecodingBatchInputOutput( ) @@ -898,13 +912,6 @@ class TRTLLMSampler(Sampler): slots = torch.tensor([r.py_seq_slot for r in adp], dtype=torch.int32) self.algs.decoder.underlying_decoder().setup(config, batch_size, slots) - @staticmethod - @torch.inference_mode() - def beam_width(scheduled_requests: Iterable[LlmRequest]) -> int: - for req in scheduled_requests: - return req.sampling_config.beam_width - return 0 - def get_cache_indirection(self) -> torch.Tensor | None: return self.store["decoder_state"].cache_indirection_output @@ -920,8 +927,9 @@ class TRTLLMSampler(Sampler): @torch.inference_mode() @nvtx_range("sample_async") - def sample_async(self, scheduled_requests: ScheduledRequests, - model_outputs) -> SampleStateTRTLLM: + def sample_async( + self, scheduled_requests: ScheduledRequests, model_outputs, + num_context_logits_prefix_sum: list[int]) -> SampleStateTRTLLM: batch_size = scheduled_requests.batch_size beam_width = self.beam_width(scheduled_requests.all_requests()) @@ -934,29 +942,10 @@ class TRTLLMSampler(Sampler): self.setup_sampler_step(scheduled_requests) - num_context_logits_prefix_sum = [0] - prefix_sum = 0 - for request in scheduled_requests.context_requests: - prefix_sum += request.context_chunk_size if request.py_return_context_logits else 1 - num_context_logits_prefix_sum.append(prefix_sum) - - if any(r.py_return_context_logits or r.py_return_generation_logits - for r in scheduled_requests.all_requests()): - self.algs.handle_logits(scheduled_requests.context_requests, - scheduled_requests.generation_requests, - model_outputs["logits"], - num_context_logits_prefix_sum, - self.max_num_sequences, beam_width) - # For beam search, cache indirection needs to be updated if beam_width > 1: self._update_cache_indirection_buffer(scheduled_requests) - # TODO: Enable this back once nanobind is merged and/or llm request is a pure python object - # decoding_input = self.algs.make_decoding_batch_input_output( - # scheduled_requests, model_outputs["logits"], beam_width, - # num_context_logits_prefix_sum) - self.store["decoding_input"][ self.micro_batch_idx] = make_decoding_batch_input( scheduled_requests.context_requests, diff --git a/tensorrt_llm/_torch/speculative/model_drafter.py b/tensorrt_llm/_torch/speculative/model_drafter.py index 7f11142c3f..5d54f2f3be 100644 --- a/tensorrt_llm/_torch/speculative/model_drafter.py +++ b/tensorrt_llm/_torch/speculative/model_drafter.py @@ -9,6 +9,7 @@ from tensorrt_llm._utils import nvtx_range from tensorrt_llm.logger import logger from ..pyexecutor.guided_decoder import GuidedDecoder +from ..pyexecutor.handle_logits import HandleLogits from ..pyexecutor.llm_request import (LlmRequest, LlmRequestState, get_draft_token_length) from ..pyexecutor.resource_manager import BaseResourceManager, ResourceManager @@ -266,7 +267,21 @@ class ModelDrafter(Drafter): """Sample tokens from draft model outputs.""" try: if self.sampler is not None: - return self.sampler.sample_async(draft_batch, outputs) + num_context_logits_prefix_sum = [0] + prefix_sum = 0 + for request in draft_batch.context_requests: + prefix_sum += request.context_chunk_size if request.py_return_context_logits else 1 + num_context_logits_prefix_sum.append(prefix_sum) + + HandleLogits()( + draft_batch.context_requests, + draft_batch.generation_requests, outputs["logits"], + self.sampler.beam_width(draft_batch.all_requests()), + num_context_logits_prefix_sum, + 
self.sampler.is_generation_model()) + + return self.sampler.sample_async(draft_batch, outputs, + num_context_logits_prefix_sum) return None except Exception as e: logger.error(f"Error in sampling: {str(e)}") diff --git a/tensorrt_llm/_torch/speculative/mtp.py b/tensorrt_llm/_torch/speculative/mtp.py index 2658ce539b..b31512df91 100644 --- a/tensorrt_llm/_torch/speculative/mtp.py +++ b/tensorrt_llm/_torch/speculative/mtp.py @@ -268,8 +268,10 @@ class MTPSampler(TorchSampler): req.py_rewind_len = self.draft_len - (num_new_tokens - 1) self._request_common_handling(req, next_draft_tokens_list) - def sample_async(self, scheduled_requests: ScheduledRequests, - outputs: dict[str, torch.Tensor]) -> SampleStateMTP: + def sample_async( + self, scheduled_requests: ScheduledRequests, + outputs: dict[str, torch.Tensor], + num_context_logits_prefix_sum: list[int]) -> SampleStateMTP: # new_tokens_device: accepted tokens, device tensor, shape: batch_size, nextn + 1 # new_tokens_lens_device: accepted lengths, device tensor, shape: batch_size # next_draft_tokens_device: predicted draft tokens, device tensor, shape: batch_size, nextn diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 618feaf928..0390c97e64 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -15,6 +15,7 @@ import os import pytest +import torch from defs.conftest import get_sm_version from tensorrt_llm import LLM @@ -398,6 +399,40 @@ class TestLlama3_2_1B(LlmapiAccuracyTestHarness): task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) + @skip_pre_hopper + @pytest.mark.skip_less_device(4) + @pytest.mark.parametrize("disable_overlap_scheduler", [True, False]) + @pytest.mark.parametrize("pp_size", [2, 4], ids=["pp2", "pp4"]) + def test_return_logits_pp(self, pp_size, disable_overlap_scheduler): + prompts = ["A B C"] + + llm = LLM(model=self.MODEL_PATH, + pipeline_parallel_size=pp_size, + disable_overlap_scheduler=disable_overlap_scheduler) + + sampling_params = SamplingParams(max_tokens=8, + return_context_logits=True, + return_generation_logits=True, + logprobs=True) + + with llm: + for output in llm.generate(prompts, + sampling_params=sampling_params): + assert output.context_logits is not None + # NOTE: prompt_token_ids of "A B C" becomes [1, 319, 350, 315] + expected_len = len(prompts[0].split()) + 1 + assert expected_len == output.context_logits.shape[0] + + gen_logits = output.outputs[0].generation_logits + assert gen_logits is not None + assert gen_logits.ndim == 2 + assert gen_logits.shape[0] == sampling_params.max_tokens + assert torch.argmax( + gen_logits, dim=1).tolist() == output.outputs[0].token_ids + + assert len( + output.outputs[0].logprobs) == sampling_params.max_tokens + class TestLlama3_2_3B(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.2-3B" diff --git a/tests/unittest/_torch/sampler/test_return_logits.py b/tests/unittest/_torch/sampler/test_return_logits.py index 0d6a5e28ca..a3af16c8bc 100644 --- a/tests/unittest/_torch/sampler/test_return_logits.py +++ b/tests/unittest/_torch/sampler/test_return_logits.py @@ -27,9 +27,6 @@ def test_generate_with_return_logits(disable_overlap_scheduler: bool, or return_log_probs): # prune space pytest.skip("Nothing to test") - if sampler_type == "TorchSampler" and gather_context_logits: - pytest.skip("TorchSampler does not support gather_context_logits") - build_config = BuildConfig() 
build_config.gather_context_logits = gather_context_logits @@ -94,9 +91,6 @@ def test_generate_async_with_return_logits(disable_overlap_scheduler: bool, or return_log_probs): # prune space pytest.skip("Nothing to test") - if sampler_type == "TorchSampler" and gather_context_logits: - pytest.skip("TorchSampler does not support gather_context_logits") - build_config = BuildConfig() build_config.gather_context_logits = gather_context_logits From a49cf684f8cb55c5c29ff276da4ab7eb65ad685d Mon Sep 17 00:00:00 2001 From: Emma Qiao Date: Fri, 22 Aug 2025 15:12:02 +0800 Subject: [PATCH 22/33] [TRTLLM-5801][infra] Add more RTX Pro 6000 test stages (#5126) Signed-off-by: qqiao --- jenkins/L0_Test.groovy | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index e99081d1c8..28c5c182fb 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -646,8 +646,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod def driverVersion = Constants.DEFAULT_NVIDIA_DRIVER_VERSION def cpuCount = "${TESTER_CORES}" - // Multi-GPU only supports DGX-H100 and DGX-H200 due to the hardware stability. - if ((type.contains("dgx-h100") || type.contains("dgx-h200")) && hasMultipleGPUs) + if (hasMultipleGPUs) { // Not a hard requirement, but based on empirical values. memorySize = "${gpuCount * 150}" + "Gi" @@ -661,7 +660,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod targetCould = "kubernetes" // The following GPU types doesn't support dynamic driver flashing. - if (type.contains("dgx-h100") || type.contains("dgx-h200") || type in ["b100-ts2", "gh200", "rtx-5080", "rtx-5090"]) { + if (type.contains("dgx-h100") || type.contains("dgx-h200") || type.contains("rtx-pro-6000") || type in ["b100-ts2", "gh200", "rtx-5080", "rtx-5090"]) { selectors = """ kubernetes.io/arch: ${arch} kubernetes.io/os: linux @@ -1270,6 +1269,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO echoNodeAndGpuInfo(pipeline, stageName) sh "cat ${MODEL_CACHE_DIR}/README" sh "nvidia-smi -q" + sh "nvidia-smi topo -m" sh "df -h" // setup HF_HOME to cache model and datasets @@ -1843,6 +1843,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) "DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4], "DGX_H200-4_GPUs-TensorRT-Post-Merge-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 3, 4], "DGX_H200-4_GPUs-TensorRT-Post-Merge-3": ["dgx-h200-x4", "l0_dgx_h200", 3, 3, 4], + "RTXPro6000-Pytorch-Post-Merge-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1], + "RTXPro6000-4_GPUs-Pytorch-Post-Merge-1": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 1, 2, 4], + "RTXPro6000-4_GPUs-Pytorch-Post-Merge-2": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 2, 2, 4], ] parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), { @@ -1861,7 +1864,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) fullSet = parallelJobs.keySet() x86SlurmTestConfigs = [ - "RTXPro6000-PyTorch-Post-Merge-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1], "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4], ] fullSet += x86SlurmTestConfigs.keySet() @@ -2456,7 +2458,7 @@ pipeline { def testPhase2StageName = env.testPhase2StageName if (testPhase2StageName) { - def dgxSigns = ["DGX_H100", "DGX_H200", "GB200", "DGX_B200"] + def dgxSigns = ["DGX_H100", "DGX_H200", "GB200", 
"DGX_B200", "RTXPro6000-4_GPUs"] singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}} dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}} } From 898f37faa0d29f069fa9e3c91cd75f4dd8c167b1 Mon Sep 17 00:00:00 2001 From: Linda <57756729+Linda-Stadter@users.noreply.github.com> Date: Fri, 22 Aug 2025 09:48:41 +0200 Subject: [PATCH 23/33] [None][feat] Enable nanobind as the default binding library (#6608) Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com> --- cpp/CMakeLists.txt | 2 +- cpp/tensorrt_llm/nanobind/CMakeLists.txt | 2 +- .../nanobind/common/customCasters.h | 30 +++++++++++++++++++ .../nanobind/executor/bindings.cpp | 3 +- .../nanobind/executor/request.cpp | 1 + .../nanobind/runtime/bindings.cpp | 2 +- cpp/tensorrt_llm/pybind/CMakeLists.txt | 2 +- jenkins/Build.groovy | 20 ++++++------- jenkins/L0_Test.groovy | 10 +++---- scripts/build_wheel.py | 6 ++-- .../integration/test_lists/test-db/l0_a10.yml | 5 ++-- 11 files changed, 58 insertions(+), 25 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4a8c8e9267..79c533fb5b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -69,7 +69,7 @@ add_compile_definitions("TLLM_GEN_EXPORT_INTERFACE") add_compile_definitions("TLLM_ENABLE_CUDA") set(BINDING_TYPE - "pybind" + "nanobind" CACHE STRING "Binding type of Python bindings for C++ runtime and batch manager") diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt index 1ccb50a02b..8c62584108 100755 --- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt +++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt @@ -43,7 +43,7 @@ target_link_libraries( ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python - CUDA::cuda_driver + ${CUDA_DRV_LIB} ${CUDA_NVML_LIB} th_common) target_compile_definitions( diff --git a/cpp/tensorrt_llm/nanobind/common/customCasters.h b/cpp/tensorrt_llm/nanobind/common/customCasters.h index a77a3bcb5a..432ce5c26b 100644 --- a/cpp/tensorrt_llm/nanobind/common/customCasters.h +++ b/cpp/tensorrt_llm/nanobind/common/customCasters.h @@ -285,5 +285,35 @@ struct type_caster>> return make_caster>::from_cpp(result, policy, cleanup); } }; + +template <> +struct type_caster +{ + NB_TYPE_CASTER(torch::ScalarType, const_name("torch.dtype")); + + bool from_python(handle src, uint8_t flags, cleanup_list* cleanup) noexcept + { + std::string dtype_name = nb::cast(nb::str(src)); + if (dtype_name.substr(0, 6) == "torch.") + { + dtype_name = dtype_name.substr(6); + } + + auto const& dtype_map = c10::getStringToDtypeMap(); + auto it = dtype_map.find(dtype_name); + if (it != dtype_map.end()) + { + value = it->second; + return true; + } + + return false; + } + + static handle from_cpp(torch::ScalarType src, rv_policy policy, cleanup_list* cleanup) + { + throw std::runtime_error("from_cpp for torch::ScalarType is not implemented"); + } +}; } // namespace detail } // namespace NB_NAMESPACE diff --git a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp index d3f482df89..ae4936a4df 100644 --- a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp @@ -240,7 +240,8 @@ void initBindings(nb::module_& m) nb::class_(executor_kv_cache, "KVCacheEvent") .def_ro("event_id", &tle::KVCacheEvent::eventId) .def_ro("data", &tle::KVCacheEvent::data) - .def_ro("window_size", &tle::KVCacheEvent::windowSize); + .def_ro("window_size", &tle::KVCacheEvent::windowSize) + .def_ro("attention_dp_rank", 
&tle::KVCacheEvent::attentionDpRank); nb::class_(executor_kv_cache, "KVCacheEventManager") .def( diff --git a/cpp/tensorrt_llm/nanobind/executor/request.cpp b/cpp/tensorrt_llm/nanobind/executor/request.cpp index 1949474a10..e56341b53e 100644 --- a/cpp/tensorrt_llm/nanobind/executor/request.cpp +++ b/cpp/tensorrt_llm/nanobind/executor/request.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp index a22a62bf80..47be92e13f 100644 --- a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp @@ -279,7 +279,7 @@ void initBindings(nb::module_& m) .def(nb::init(), nb::arg("stream")) .def("setup", &tr::GptDecoderBatched::setup, nb::arg("mode"), nb::arg("max_num_sequences"), nb::arg("max_beam_width"), nb::arg("dtype"), nb::arg("model_config"), nb::arg("world_config")) - .def("forward_async", &tr::GptDecoderBatched::forwardAsync, nb::arg("output"), nb::arg("input")) + .def("forward_async", &tr::GptDecoderBatched::forwardAsync, nb::arg("decoder_state"), nb::arg("input")) .def("underlying_decoder", &tr::GptDecoderBatched::getUnderlyingDecoder, nb::rv_policy::reference) .def("finalize", &tr::GptDecoderBatched::finalize, nb::arg("decoder_state"), nb::arg("batch_idx"), nb::arg("sampling_config"), nb::arg("streaming")) diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt index 91b5ebf548..9d758b427c 100755 --- a/cpp/tensorrt_llm/pybind/CMakeLists.txt +++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt @@ -44,7 +44,7 @@ target_link_libraries( ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python - CUDA::cuda_driver + ${CUDA_DRV_LIB} ${CUDA_NVML_LIB} th_common) target_compile_definitions( diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy index d689de393b..3c4d8e4543 100644 --- a/jenkins/Build.groovy +++ b/jenkins/Build.groovy @@ -48,10 +48,10 @@ CONFIG_LINUX_AARCH64 = "linux_aarch64" def CONFIG_LINUX_AARCH64_LLVM = "linux_aarch64_LLVM" @Field -def CONFIG_LINUX_X86_64_NANOBIND = "linux_x86_64_Nanobind" +def CONFIG_LINUX_X86_64_PYBIND = "linux_x86_64_Pybind" @Field -def CONFIG_LINUX_AARCH64_NANOBIND = "linux_aarch64_Nanobind" +def CONFIG_LINUX_AARCH64_PYBIND = "linux_aarch64_Pybind" @Field def BUILD_CONFIGS = [ @@ -62,9 +62,9 @@ def BUILD_CONFIGS = [ (TARNAME) : "TensorRT-LLM.tar.gz", (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real", ], - (CONFIG_LINUX_X86_64_NANOBIND) : [ - (WHEEL_EXTRA_ARGS) : "--binding_type nanobind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", - (TARNAME) : "nanobind-TensorRT-LLM.tar.gz", + (CONFIG_LINUX_X86_64_PYBIND) : [ + (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", + (TARNAME) : "pybind-TensorRT-LLM.tar.gz", (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real", ], (CONFIG_LINUX_X86_64_SINGLE_DEVICE) : [ @@ -82,9 +82,9 @@ def BUILD_CONFIGS = [ (TARNAME) : "TensorRT-LLM-GH200.tar.gz", (WHEEL_ARCHS): "90-real;100-real;120-real", ], - (CONFIG_LINUX_AARCH64_NANOBIND): [ - (WHEEL_EXTRA_ARGS) : "--binding_type nanobind --extra-cmake-vars WARNING_IS_ERROR=ON", - (TARNAME) : "nanobind-TensorRT-LLM-GH200.tar.gz", + (CONFIG_LINUX_AARCH64_PYBIND): [ + (WHEEL_EXTRA_ARGS) : "--binding_type pybind 
--extra-cmake-vars WARNING_IS_ERROR=ON", + (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz", (WHEEL_ARCHS): "90-real;100-real;120-real", ], (CONFIG_LINUX_AARCH64_LLVM) : [ @@ -542,8 +542,8 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars) pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA), "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild( pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM), - "Build TRT-LLM Nanobind": [LLM_DOCKER_IMAGE] + prepareLLMBuild( - pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_NANOBIND : CONFIG_LINUX_X86_64_NANOBIND), + "Build TRT-LLM Pybind": [LLM_DOCKER_IMAGE] + prepareLLMBuild( + pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_PYBIND : CONFIG_LINUX_X86_64_PYBIND), ] if (cpu_arch == X86_64_TRIPLE) { diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 28c5c182fb..d57cc74d4e 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -65,7 +65,7 @@ def LLVM_CONFIG = "LLVM" LINUX_AARCH64_CONFIG = "linux_aarch64" @Field -def NANOBIND_CONFIG = "Nanobind" +def PYBIND_CONFIG = "Pybind" @Field def BUILD_CONFIGS = [ @@ -74,7 +74,7 @@ def BUILD_CONFIGS = [ (SINGLE_DEVICE_CONFIG) : [(TARNAME) : "single-device-TensorRT-LLM.tar.gz"], (LLVM_CONFIG) : [(TARNAME) : "llvm-TensorRT-LLM.tar.gz"], (LINUX_AARCH64_CONFIG) : [(TARNAME) : "TensorRT-LLM-GH200.tar.gz"], - (NANOBIND_CONFIG) : [(TARNAME) : "nanobind-TensorRT-LLM.tar.gz"], + (PYBIND_CONFIG) : [(TARNAME) : "pybind-TensorRT-LLM.tar.gz"], ] // TODO: Move common variables to an unified location @@ -1775,7 +1775,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) "A10-TensorRT-4": ["a10", "l0_a10", 4, 6], "A10-TensorRT-5": ["a10", "l0_a10", 5, 6], "A10-TensorRT-6": ["a10", "l0_a10", 6, 6], - "A10-Nanobind": ["a10", "l0_a10_nanobind", 1, 1], + "A10-Pybind": ["a10", "l0_a10_pybind", 1, 1], "A30-Triton-1": ["a30", "l0_a30", 1, 1], "A30-PyTorch-1": ["a30", "l0_a30", 1, 2], "A30-PyTorch-2": ["a30", "l0_a30", 2, 2], @@ -1856,8 +1856,8 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) if (key.contains("llvm")) { config = LLVM_CONFIG } - if (key.contains("Nanobind")) { - config = NANOBIND_CONFIG + if (key.contains("Pybind")) { + config = PYBIND_CONFIG } runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3]) }]]} diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index d7cd4c61f1..a1275bf106 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -435,7 +435,7 @@ def main(*, install: bool = False, skip_building_wheel: bool = False, linking_install_binary: bool = False, - binding_type: str = "pybind", + binding_type: str = "nanobind", benchmarks: bool = False, micro_benchmarks: bool = False, nvtx: bool = False, @@ -984,8 +984,8 @@ def add_arguments(parser: ArgumentParser): ) parser.add_argument("--binding_type", choices=["pybind", "nanobind"], - default="pybind", - help="Which binding type to build: pybind or nanobind") + default="nanobind", + help="Which binding library to use: pybind or nanobind") parser.add_argument("--benchmarks", action="store_true", help="Build the benchmarks for the C++ runtime") diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index ce285faa79..30fc6c05b5 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -199,7 +199,7 @@ l0_a10: 
tests:
  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test]
-l0_a10_nanobind:
+l0_a10_pybind:
 - condition:
     ranges:
       system_gpu_count:
@@ -211,6 +211,7 @@ l0_a10_pybind:
       linux_distribution_name: ubuntu*
     terms:
       stage: pre_merge
-      backend: tensorrt
   tests:
   - unittest/bindings
+  - test_e2e.py::test_openai_chat_example[trt]
+  - test_e2e.py::test_openai_chat_example[pytorch] TIMEOUT (90)

From d94cc3fa3c8e776807835c9fea44a1e0f17038b9 Mon Sep 17 00:00:00 2001
From: dongfengy <99041270+dongfengy@users.noreply.github.com>
Date: Fri, 22 Aug 2025 01:17:01 -0700
Subject: [PATCH 24/33] [TRTLLM-7321][doc] Add GPT-OSS Deployment Guide into official doc site (#7143)

Signed-off-by: Dongfeng Yu
---
 ...uick-start-recipe-for-gpt-oss-on-trtllm.md | 328 ++++++++++++++++++
 docs/source/index.rst                         |   1 +
 2 files changed, 329 insertions(+)
 create mode 100644 docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md

diff --git a/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md b/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md
new file mode 100644
index 0000000000..b201deb8f4
--- /dev/null
+++ b/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md
@@ -0,0 +1,328 @@
+# Quick Start Recipe for GPT-OSS on TensorRT-LLM - Blackwell Hardware
+
+## Introduction
+
+This deployment guide provides step-by-step instructions for running the GPT-OSS model using TensorRT-LLM, optimized for NVIDIA GPUs. It covers the complete setup required, from accessing model weights and preparing the software environment to configuring TensorRT-LLM parameters, launching the server, and validating inference output.
+
+The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA’s accelerated stack, starting from the TensorRT-LLM container on NGC and serving the model with `trtllm-serve`.
+
+## Prerequisites
+
+* GPU: NVIDIA Blackwell Architecture
+* OS: Linux
+* Drivers: CUDA Driver 575 or Later
+* Docker with NVIDIA Container Toolkit installed
+* Python3 and python3-pip (Optional, for accuracy evaluation only)
+
+## Models
+
+* MXFP4 model: [GPT-OSS-120B](https://huggingface.co/openai/gpt-oss-120b)
+
+
+## MoE Backend Support Matrix
+
+There are multiple MoE backends inside TRT-LLM. The following table shows the support matrix of these MoE backends.
+
+| Device | Activation Type | MoE Weights Type | MoE Backend | Use Case |
+|------------|------------------|------------------|-------------|----------------|
+| B200/GB200 | MXFP8 | MXFP4 | TRTLLM | Low Latency |
+| B200/GB200 | MXFP8 | MXFP4 | CUTLASS | Max Throughput |
+
+The default MoE backend is `CUTLASS`, so for combinations that `CUTLASS` does not support, you must set `moe_config.backend` explicitly to run the model.
+
+## Deployment Steps
+
+### Run Docker Container
+
+Run the docker container using the TensorRT-LLM NVIDIA NGC image.
+
+```shell
+docker run --rm -it \
+--ipc=host \
+--gpus all \
+-p 8000:8000 \
+-v ~/.cache:/root/.cache:rw \
+--name tensorrt_llm \
+nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc6 \
+/bin/bash
+```
+
+Note:
+
+* The command mounts your user `.cache` directory to save the downloaded model checkpoints, which are saved to `~/.cache/huggingface/hub/` by default.
This prevents having to redownload the weights each time you rerun the container. If the `~/.cache` directory doesn’t exist, please create it using `$ mkdir ~/.cache`.
+* You can mount additional directories and paths using the `-v <host_path>:<container_path>` flag if needed, such as mounting the downloaded weight paths.
+* The command also maps port `8000` from the container to your host so you can access the LLM API endpoint from your host.
+* See the TensorRT-LLM container page on NGC for all the available containers. The containers published weekly from the main branch have an `rcN` suffix, while the monthly releases that pass QA testing have no `rcN` suffix. Use the `rc` release to get the latest model and feature support.
+
+If you want to use the latest main branch, you can instead build and install TensorRT-LLM from source; refer to the build-from-source documentation for the steps.
+
+### Creating the TRT-LLM Server config
+
+We create a YAML configuration file `/tmp/config.yml` for the TensorRT-LLM Server and populate it with the following recommended performance settings.
+
+For low latency with the `TRTLLM` MoE backend:
+
+```shell
+EXTRA_LLM_API_FILE=/tmp/config.yml
+
+cat << EOF > ${EXTRA_LLM_API_FILE}
+enable_attention_dp: false
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 128
+moe_config:
+  backend: TRTLLM
+EOF
+```
+
+For max throughput with the `CUTLASS` MoE backend:
+
+```shell
+EXTRA_LLM_API_FILE=/tmp/config.yml
+
+cat << EOF > ${EXTRA_LLM_API_FILE}
+enable_attention_dp: true
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 128
+moe_config:
+  backend: CUTLASS
+EOF
+```
+
+### Launch the TRT-LLM Server
+
+Below is an example command to launch the TRT-LLM server with the GPT-OSS model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “Configs and Parameters” section.
+
+```shell
+trtllm-serve openai/gpt-oss-120b \
+    --host 0.0.0.0 \
+    --port 8000 \
+    --backend pytorch \
+    --max_batch_size 128 \
+    --max_num_tokens 16384 \
+    --max_seq_len 2048 \
+    --kv_cache_free_gpu_memory_fraction 0.9 \
+    --tp_size 8 \
+    --ep_size 8 \
+    --trust_remote_code \
+    --extra_llm_api_options ${EXTRA_LLM_API_FILE}
+```
+
+After the server is set up, the client can now send prompt requests to the server and receive results.
+
+### Configs and Parameters
+
+These options are used directly on the command line when you start the `trtllm-serve` process.
+
+#### `--tp_size`
+
+* **Description:** Sets the **tensor-parallel size**. This should typically match the number of GPUs you intend to use for a single model instance.
+
+#### `--ep_size`
+
+* **Description:** Sets the **expert-parallel size** for Mixture-of-Experts (MoE) models. Like `tp_size`, this should generally match the number of GPUs you're using. This setting has no effect on non-MoE models.
+
+#### `--kv_cache_free_gpu_memory_fraction`
+
+* **Description:** A value between `0.0` and `1.0` that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors.
+* **Recommendation:** If you experience OOM errors, try reducing this value to `0.7` or lower.
+
+#### `--backend pytorch`
+
+* **Description:** Tells TensorRT-LLM to use the **pytorch** backend.
+
+#### `--max_batch_size`
+
+* **Description:** The maximum number of user requests that can be grouped into a single batch for processing.
+ +#### `--max_num_tokens` + +* **Description:** The maximum total number of tokens (across all requests) allowed inside a single scheduled batch. + +#### `--max_seq_len` + +* **Description:** The maximum possible sequence length for a single request, including both input and generated output tokens. + +#### `--trust_remote_code` + +* **Description:** Allows TensorRT-LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API. + + +#### Extra LLM API Options (YAML Configuration) + +These options provide finer control over performance and are set within a YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. + +#### `cuda_graph_config` + +* **Description**: A section for configuring CUDA graphs to optimize performance. + +* **Options**: + + * `enable_padding`: If `"true"`, input batches are padded to the nearest `cuda_graph_batch_size`. This can significantly improve performance. + + **Default**: `false` + + * `max_batch_size`: Sets the maximum batch size for which a CUDA graph will be created. + + **Default**: `0` + + **Recommendation**: Set this to the same value as the `--max_batch_size` command-line option. + +#### `moe_config` + +* **Description**: Configuration for Mixture-of-Experts (MoE) models. + +* **Options**: + + * `backend`: The backend to use for MoE operations. + **Default**: `CUTLASS` + +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. + +## Testing API Endpoint + +### Basic Test + +Start a new terminal on the host to test the TensorRT-LLM server you just launched. + +You can query the health/readiness of the server using: + +```shell +curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health" +``` + +When the `Status: 200` code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation. + +After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server. + +```shell +curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "openai/gpt-oss-120b", + "messages": [ + { + "role": "user", + "content": "Where is New York?" + } + ], + "max_tokens": 1024, + "top_p": 1.0 +}' -w "\n" +``` + +Here is an example response, showing that the TRT-LLM server reasons and answers the questions. + +TODO: Use Chat Compeletions API / Responses API as the example after the PR is merged. + +```json +{"id":"chatcmpl-c5bf51b5cab94e10ba5da5266d12ee59","object":"chat.completion","created":1755815898,"model":"openai/gpt-oss-120b","choices":[{"index":0,"message":{"role":"assistant","content":"analysisThe user asks: \"Where is New York?\" Likely they want location info. Provide answer: New York State in northeastern US, New York City on the east coast, coordinates, etc. Provide context.assistantfinal**New York** can refer to two related places in the United States:\n\n| What it is | Where it is | Approx. coordinates | How to picture it |\n|------------|------------|--------------------|-------------------|\n| **New York State** | The northeastern corner of the United States, bordered by **Vermont, Massachusetts, Connecticut, New Jersey, Pennsylvania, and the Canadian provinces of Ontario and Quebec**. 
| 42.7° N, 75.5° W (roughly the state’s geographic centre) | A roughly rectangular state that stretches from the Atlantic Ocean in the southeast to the Adirondack Mountains and the Great Lakes region in the north. |\n| **New York City (NYC)** | The largest city in the state, located on the **southern tip of the state** where the **Hudson River meets the Atlantic Ocean**. It occupies five boroughs: Manhattan, Brooklyn, Queens, The Bronx, and Staten Island. | 40.7128° N, 74.0060° W | A dense, world‑famous metropolis that sits on a series of islands (Manhattan, Staten Island, parts of the Bronx) and the mainland (Brooklyn and Queens). |\n\n### Quick geographic context\n- **On a map of the United States:** New York State is in the **Northeast** region, just east of the Great Lakes and north of Pennsylvania. \n- **From Washington, D.C.:** Travel roughly **225 mi (360 km) northeast**. \n- **From Boston, MA:** Travel about **215 mi (350 km) southwest**. \n- **From Toronto, Canada:** Travel about **500 mi (800 km) southeast**.\n\n### Travel tips\n- **By air:** Major airports include **John F. Kennedy International (JFK)**, **LaGuardia (LGA)**, and **Newark Liberty International (EWR)** (the latter is actually in New Jersey but serves the NYC metro area). \n- **By train:** Amtrak’s **Northeast Corridor** runs from **Boston → New York City → Washington, D.C.** \n- **By car:** Interstates **I‑87** (north‑south) and **I‑90** (east‑west) are the primary highways crossing the state.\n\n### Fun fact\n- The name “**New York**” was given by the English in 1664, honoring the Duke of York (later King James II). The city’s original Dutch name was **“New Amsterdam.”**\n\nIf you need more specific directions (e.g., how to get to a particular neighborhood, landmark, or the state capital **Albany**), just let me know!","reasoning_content":null,"tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null,"mm_embedding_handle":null,"disaggregated_params":null,"avg_decoded_tokens_per_iter":1.0}],"usage":{"prompt_tokens":72,"total_tokens":705,"completion_tokens":633},"prompt_token_ids":null} +``` + +### Troubleshooting Tips + +* If you encounter CUDA out-of-memory errors, try reducing `max_batch_size` or `max_seq_len`. +* Ensure your model checkpoints are compatible with the expected format. +* For performance issues, check GPU utilization with nvidia-smi while the server is running. +* If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed. +* For connection issues, make sure the server port (`8000` in this guide) is not being used by another application. + +### Running Evaluations to Verify Accuracy (Optional) + +We use OpenAI's official evaluation tool to test the model's accuracy. For more information see [https://github.com/openai/gpt-oss/tree/main/gpt_oss/evals](gpt-oss-eval). + +TODO(@Binghan Chen): Add instructions for running gpt-oss-eval. + +## Benchmarking Performance + +To benchmark the performance of your TensorRT-LLM server you can leverage the built-in `benchmark_serving.py` script. To do this first creating a wrapper `bench.sh` script. 
+ +```shell +cat <<'EOF' > bench.sh +#!/usr/bin/env bash +set -euo pipefail + +concurrency_list="32 64 128 256 512 1024 2048 4096" +multi_round=5 +isl=1024 +osl=1024 +result_dir=/tmp/gpt_oss_output + +for concurrency in ${concurrency_list}; do + num_prompts=$((concurrency * multi_round)) + python -m tensorrt_llm.serve.scripts.benchmark_serving \ + --model openai/gpt-oss-120b \ + --backend openai \ + --dataset-name "random" \ + --random-input-len ${isl} \ + --random-output-len ${osl} \ + --random-prefix-len 0 \ + --random-ids \ + --num-prompts ${num_prompts} \ + --max-concurrency ${concurrency} \ + --ignore-eos \ + --tokenize-on-client \ + --percentile-metrics "ttft,tpot,itl,e2el" +done +EOF +chmod +x bench.sh +``` + +If you want to save the results to a file add the following options. + +```shell +--save-result \ +--result-dir "${result_dir}" \ +--result-filename "concurrency_${concurrency}.json" +``` + +For more benchmarking options see . + +Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script. + +```shell +./bench.sh +``` + +Sample TensorRT-LLM serving benchmark output. Your results may vary due to ongoing software optimizations. + +``` +============ Serving Benchmark Result ============ +Successful requests: 16 +Benchmark duration (s): 17.66 +Total input tokens: 16384 +Total generated tokens: 16384 +Request throughput (req/s): [result] +Output token throughput (tok/s): [result] +Total Token throughput (tok/s): [result] +User throughput (tok/s): [result] +---------------Time to First Token---------------- +Mean TTFT (ms): [result] +Median TTFT (ms): [result] +P99 TTFT (ms): [result] +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): [result] +Median TPOT (ms): [result] +P99 TPOT (ms): [result] +---------------Inter-token Latency---------------- +Mean ITL (ms): [result] +Median ITL (ms): [result] +P99 ITL (ms): [result] +----------------End-to-end Latency---------------- +Mean E2EL (ms): [result] +Median E2EL (ms): [result] +P99 E2EL (ms): [result] +================================================== +``` + +### Key Metrics + +* Median Time to First Token (TTFT) + * The typical time elapsed from when a request is sent until the first output token is generated. +* Median Time Per Output Token (TPOT) + * The typical time required to generate each token *after* the first one. +* Median Inter-Token Latency (ITL) + * The typical time delay between the completion of one token and the completion of the next. +* Median End-to-End Latency (E2EL) + * The typical total time from when a request is submitted until the final token of the response is received. +* Total Token Throughput + * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens. diff --git a/docs/source/index.rst b/docs/source/index.rst index b0964ca287..df00acb02b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -38,6 +38,7 @@ Welcome to TensorRT-LLM's Documentation! deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.md deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.md deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.md + deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md .. 
toctree:: From b8b2bd4a0a1818ae666013382f5bea7d90388369 Mon Sep 17 00:00:00 2001 From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com> Date: Fri, 22 Aug 2025 17:17:27 +0800 Subject: [PATCH 25/33] [TRTLLM-7245][feat] add test_multi_nodes_eval tests (#7108) Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com> --- tests/integration/defs/common.py | 20 ++++++++ tests/integration/defs/test_e2e.py | 47 +++++++++++++++++-- .../test_lists/qa/llm_function_multinode.txt | 13 +++-- tests/integration/test_lists/waives.txt | 1 + 4 files changed, 71 insertions(+), 10 deletions(-) diff --git a/tests/integration/defs/common.py b/tests/integration/defs/common.py index a61a5b8c28..7136bac02d 100644 --- a/tests/integration/defs/common.py +++ b/tests/integration/defs/common.py @@ -956,3 +956,23 @@ def get_dummy_spec_decoding_heads(hf_model_dir, export_hf_checkpoint(model, dtype=model.config.torch_dtype, export_dir=os.path.join(save_dir, 'fp8')) + + +def get_mmlu_accuracy(output): + mmlu_line = None + for line in output.split('\n'): + if "MMLU weighted average accuracy:" in line: + mmlu_line = line + break + + if mmlu_line is None: + raise Exception( + f"Could not find 'MMLU weighted average accuracy:' in output. Full output:\n{output}" + ) + + mmlu_accuracy = float( + mmlu_line.split("MMLU weighted average accuracy: ")[1].split(" (")[0]) + + print(f"MMLU weighted average accuracy is: {mmlu_accuracy}") + + return mmlu_accuracy diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index bb38e94aac..ef615843bb 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -28,8 +28,9 @@ from defs.common import convert_weights from defs.trt_test_alternative import (check_call, check_call_negative_test, check_output) -from .common import (PluginOptions, convert_weights, prune_checkpoint, - quantize_data, refit_model, venv_check_call) +from .common import (PluginOptions, convert_weights, get_mmlu_accuracy, + prune_checkpoint, quantize_data, refit_model, + venv_check_call) from .conftest import (llm_models_root, skip_no_sm120, skip_nvlink_inactive, skip_post_blackwell, skip_pre_blackwell, skip_pre_hopper, tests_path, unittest_path) @@ -42,6 +43,7 @@ if TEST_MEM_USAGE: os.environ['TLLM_LOG_LEVEL'] = 'INFO' _MEM_FRACTION_50 = 0.5 +_MEM_FRACTION_80 = 0.8 _MEM_FRACTION_95 = 0.95 @@ -2677,4 +2679,43 @@ def test_ptp_quickstart_advanced_llama_multi_nodes(llm_root, llm_venv, check_call(" ".join(run_cmd), shell=True, env=llm_venv._new_env) -# End of Pivot-To-Python examples +@pytest.mark.timeout(5400) +@pytest.mark.skip_less_device_memory(80000) +@pytest.mark.skip_less_device(4) +@pytest.mark.parametrize("eval_task", ["mmlu"]) +@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(16, 1, 8), (8, 2, 8)], + ids=["tp16", "tp8pp2"]) +@pytest.mark.parametrize("model_path", [ + pytest.param('llama-3.3-models/Llama-3.3-70B-Instruct', + marks=skip_pre_hopper), + pytest.param('llama4-models/Llama-4-Maverick-17B-128E-Instruct', + marks=skip_pre_hopper), + pytest.param('llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8', + marks=skip_pre_hopper), + pytest.param('Qwen3/Qwen3-235B-A22B', marks=skip_pre_hopper), + pytest.param('Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf', + marks=skip_pre_blackwell), + pytest.param('DeepSeek-R1/DeepSeek-R1-0528-FP4', marks=skip_pre_blackwell), +]) +def test_multi_nodes_eval(llm_venv, model_path, tp_size, pp_size, ep_size, + eval_task): + if "Llama-4" in model_path and tp_size == 16: + pytest.skip("Llama-4 
with tp16 is not supported") + + mmlu_threshold = 81.5 + run_cmd = [ + "trtllm-llmapi-launch", + "trtllm-eval", + f"--model={llm_models_root()}/{model_path}", + f"--ep_size={ep_size}", + f"--tp_size={tp_size}", + f"--pp_size={pp_size}", + f"--kv_cache_free_gpu_memory_fraction={_MEM_FRACTION_80}", + "--max_batch_size=32", + eval_task, + ] + output = check_output(" ".join(run_cmd), shell=True, env=llm_venv._new_env) + + if os.environ.get("SLURM_PROCID", '0') == '0': + mmlu_accuracy = get_mmlu_accuracy(output) + assert mmlu_accuracy > mmlu_threshold, f"MMLU accuracy {mmlu_accuracy} is less than threshold {mmlu_threshold}" diff --git a/tests/integration/test_lists/qa/llm_function_multinode.txt b/tests/integration/test_lists/qa/llm_function_multinode.txt index 1348faa84b..06a3d4714b 100644 --- a/tests/integration/test_lists/qa/llm_function_multinode.txt +++ b/tests/integration/test_lists/qa/llm_function_multinode.txt @@ -1,9 +1,8 @@ -examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-build] -examples/test_llama.py::test_llm_llama_v3_1_2nodes_8gpus[llama-3.1-8b-disable_fp8-tp16pp1-infer] -examples/test_mixtral.py::test_llm_mixtral_2nodes_8gpus[Mixtral-8x22B-v0.1-plugin-renormalize-tensor_parallel-build] -examples/test_mixtral.py::test_llm_mixtral_2nodes_8gpus[Mixtral-8x22B-v0.1-plugin-renormalize-tensor_parallel-infer] test_e2e.py::test_ptp_quickstart_advanced_deepseek_multi_nodes[DeepSeek-V3] -test_e2e.py::test_ptp_quickstart_advanced_deepseek_multi_nodes[DeepSeek-R1/DeepSeek-R1-0528-FP4] -test_e2e.py::test_ptp_quickstart_advanced_llama_multi_nodes[llama-3.3-models/Llama-3.3-70B-Instruct] -test_e2e.py::test_ptp_quickstart_advanced_llama_multi_nodes[llama4-models/Llama-4-Maverick-17B-128E-Instruct] test_e2e.py::test_openai_multinodes_chat_tp16pp1 +test_e2e.py::test_multi_nodes_eval[llama-3.3-models/Llama-3.3-70B-Instruct-tp16-mmlu] +test_e2e.py::test_multi_nodes_eval[llama4-models/Llama-4-Maverick-17B-128E-Instruct-tp8pp2-mmlu] +test_e2e.py::test_multi_nodes_eval[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-tp8pp2-mmlu] +test_e2e.py::test_multi_nodes_eval[Qwen3/Qwen3-235B-A22B-tp16-mmlu] +test_e2e.py::test_multi_nodes_eval[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-tp16-mmlu] +test_e2e.py::test_multi_nodes_eval[DeepSeek-R1/DeepSeek-R1-0528-FP4-tp16-mmlu] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 9d1a19250c..cc970b452f 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -324,3 +324,4 @@ accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_mo full:L40S/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] SKIP (https://nvbugs/5347051) full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] SKIP (https://nvbugs/5471106) full:L40S/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2] SKIP (https://nvbugs/5471108) +test_e2e.py::test_multi_nodes_eval[llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-tp8pp2-mmlu] SKIP (https://nvbugs/5473781) From 1388e8479361641067db8bc52e77f2c0c13f8699 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 22 Aug 2025 18:47:23 +0800 Subject: [PATCH 26/33] [None][ci] move all B200 TensorRT test cases to post merge (#7165) Signed-off-by: junq 
<22017000+QiJune@users.noreply.github.com> --- jenkins/L0_Test.groovy | 4 ++-- tests/integration/test_lists/test-db/l0_b200.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index d57cc74d4e..95f5a38e26 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1795,8 +1795,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) "B200_PCIe-PyTorch-1": ["b100-ts2", "l0_b200", 1, 3], "B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 3], "B200_PCIe-PyTorch-3": ["b100-ts2", "l0_b200", 3, 3], - "B200_PCIe-TensorRT-1": ["b100-ts2", "l0_b200", 1, 2], - "B200_PCIe-TensorRT-2": ["b100-ts2", "l0_b200", 2, 2], "RTX5090-PyTorch-1": ["rtx-5090", "l0_gb202", 1, 1], "RTX5080-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2], "RTX5080-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2], @@ -1836,6 +1834,8 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5], "B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1], "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1], + "B200_PCIe-TensorRT-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2], + "B200_PCIe-TensorRT-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2], "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1], "H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1], "DGX_H200-8_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8], diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index ae0d0bd041..66cf676f2f 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -79,7 +79,7 @@ l0_b200: - '*b100*' linux_distribution_name: ubuntu* terms: - stage: pre_merge + stage: post_merge backend: tensorrt tests: # ------------- TRT tests --------------- From 907bc22fcb2f9ca2bc3d3d009c94a8bb3e6e4d1c Mon Sep 17 00:00:00 2001 From: Yiqing Yan Date: Fri, 22 Aug 2025 22:02:28 +0800 Subject: [PATCH 27/33] [None][chore] Bump version to 1.1.0rc2 (#7167) Signed-off-by: Yiqing Yan --- README.md | 2 +- examples/constraints.txt | 2 +- tensorrt_llm/version.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index def2a7cb5f..745713e581 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ TensorRT-LLM [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/) [![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads) [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt) -[![version](https://img.shields.io/badge/release-1.1.0rc1-green)](./tensorrt_llm/version.py) +[![version](https://img.shields.io/badge/release-1.1.0rc2-green)](./tensorrt_llm/version.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE) [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap) diff --git a/examples/constraints.txt b/examples/constraints.txt index 4ce23b0de7..8b0d1a0093 100644 --- a/examples/constraints.txt +++ b/examples/constraints.txt @@ -1,3 +1,3 @@ -tensorrt_llm==1.1.0rc1 +tensorrt_llm==1.1.0rc2 
evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index 603fd689b7..93b6027df5 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.1.0rc1" +__version__ = "1.1.0rc2" From e3de5758a3d42d50aa15affd04d2568e0a53eb92 Mon Sep 17 00:00:00 2001 From: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com> Date: Fri, 22 Aug 2025 08:30:53 -0700 Subject: [PATCH 28/33] [#7136][feat] trtllm-serve + autodeploy integration (#7141) Signed-off-by: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com> --- .../advanced/serving_with_trtllm_serve.md | 77 +++++++++++++++++++ docs/source/torch/auto_deploy/auto-deploy.md | 1 + .../custom_ops/flashinfer_attention.py | 1 - tensorrt_llm/_torch/auto_deploy/llm_args.py | 10 +++ tensorrt_llm/commands/serve.py | 24 ++++-- .../defs/accuracy/test_llm_api_autodeploy.py | 2 + 6 files changed, 107 insertions(+), 8 deletions(-) create mode 100644 docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md diff --git a/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md b/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md new file mode 100644 index 0000000000..5a73d047ea --- /dev/null +++ b/docs/source/torch/auto_deploy/advanced/serving_with_trtllm_serve.md @@ -0,0 +1,77 @@ +# Serving with trtllm-serve + +AutoDeploy integrates with the OpenAI-compatible `trtllm-serve` CLI so you can expose AutoDeploy-optimized models over HTTP without writing server code. This page shows how to launch the server with the AutoDeploy backend, configure it via YAML, and validate with a simple request. + +## Quick start + +Launch `trtllm-serve` with the AutoDeploy backend by setting `--backend _autodeploy`: + +```bash +trtllm-serve \ + meta-llama/Llama-3.1-8B-Instruct \ + --backend _autodeploy +``` + +- `model`: HF name or local path +- `--backend _autodeploy`: uses AutoDeploy runtime + +Once the server is ready, test with an OpenAI-compatible request: + +```bash +curl -s http://localhost:8000/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "messages":[{"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Where is New York? Tell me in a single sentence."}], + "max_tokens": 32 + }' +``` + +## Configuration via YAML + +Use `--extra_llm_api_options` to supply a YAML file that augments or overrides server/runtime settings. 
+ +```bash +trtllm-serve \ + meta-llama/Llama-3.1-8B \ + --backend _autodeploy \ + --extra_llm_api_options autodeploy_config.yaml +``` + +Example `autodeploy_config.yaml`: + +```yaml +# Compilation backend for AutoDeploy +compile_backend: torch-opt # options: torch-simple, torch-compile, torch-cudagraph, torch-opt + +# Runtime engine +runtime: trtllm # options: trtllm, demollm + +# Model loading +skip_loading_weights: false # set true for architecture-only perf runs + +# KV cache memory +free_mem_ratio: 0.8 # fraction of free GPU mem for KV cache + +# CUDA graph optimization +cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64] + +# Attention backend +attn_backend: flashinfer # recommended for best performance +``` + +## Limitations and tips + +- KV cache block reuse is disabled automatically for AutoDeploy backend +- AutoDeploy backend doesn't yet support disaggregated serving. WIP +- For best performance: + - Prefer `compile_backend: torch-opt` + - Use `attn_backend: flashinfer` + - Set realistic `cuda_graph_batch_sizes` that match expected traffic + - Tune `free_mem_ratio` to 0.8–0.9 + +## See also + +- [AutoDeploy overview](../auto-deploy.md) +- [Benchmarking with trtllm-bench](./benchmarking_with_trtllm_bench.md) diff --git a/docs/source/torch/auto_deploy/auto-deploy.md b/docs/source/torch/auto_deploy/auto-deploy.md index fc00c0ccc3..185e1f321a 100644 --- a/docs/source/torch/auto_deploy/auto-deploy.md +++ b/docs/source/torch/auto_deploy/auto-deploy.md @@ -59,6 +59,7 @@ The exported graph then undergoes a series of automated transformations, includi - [Incorporating AutoDeploy into Your Own Workflow](./advanced/workflow.md) - [Expert Configurations](./advanced/expert_configurations.md) - [Performance Benchmarking](./advanced/benchmarking_with_trtllm_bench.md) +- [Serving with trtllm-serve](./advanced/serving_with_trtllm_serve.md) ## Roadmap diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py index 414039a506..01fb0deb57 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/flashinfer_attention.py @@ -198,7 +198,6 @@ def prepare_flashinfer_metadata( flashinfer.get_seq_lens(paged_kv_indptr, paged_kv_last_page_len, page_size), position_ids.numel(), ) - # return metadata return ( qo_indptr, diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py index 812dfea29c..9811274a8b 100644 --- a/tensorrt_llm/_torch/auto_deploy/llm_args.py +++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py @@ -274,6 +274,16 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings): self._quant_config = value ### VALIDATION ################################################################################# + @field_validator("max_seq_len", mode="before") + @classmethod + def ensure_max_seq_len(cls, value: Any, info: ValidationInfo) -> Any: + if value is None: + # Fallback to the AutoDeployConfig default when not provided + return AutoDeployConfig.model_fields["max_seq_len"].get_default( + call_default_factory=True + ) + return value + @field_validator("build_config", mode="before") @classmethod def ensure_no_build_config(cls, value: Any, info: ValidationInfo) -> Any: diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 07eb13d796..c1013eb3c5 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -14,6 +14,7 @@ from torch.cuda import device_count 
from tensorrt_llm import LLM as PyTorchLLM from tensorrt_llm import MultimodalEncoder from tensorrt_llm._tensorrt_engine import LLM +from tensorrt_llm._torch.auto_deploy.llm import LLM as AutoDeployLLM from tensorrt_llm._utils import mpi_rank from tensorrt_llm.executor.utils import LlmLauncherEnvs from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy, @@ -109,7 +110,7 @@ def get_llm_args(model: str, capacity_scheduler_policy=CapacitySchedulerPolicy.GUARANTEED_NO_EVICT, dynamic_batch_config=dynamic_batch_config, ) - + backend = backend if backend in ["pytorch", "_autodeploy"] else None llm_args = { "model": model, @@ -140,7 +141,7 @@ def get_llm_args(model: str, "kv_cache_config": kv_cache_config, "backend": - backend if backend == "pytorch" else None, + backend, "num_postprocess_workers": num_postprocess_workers, "postprocess_tokenizer_dir": @@ -162,9 +163,15 @@ def launch_server(host: str, backend = llm_args["backend"] model = llm_args["model"] - if backend == 'pytorch': llm = PyTorchLLM(**llm_args) + elif backend == '_autodeploy': + # AutoDeploy does not support build_config + llm_args.pop("build_config", None) + # TODO(https://github.com/NVIDIA/TensorRT-LLM/issues/7142): + # AutoDeploy does not support cache reuse yet. + llm_args["kv_cache_config"].enable_block_reuse = False + llm = AutoDeployLLM(**llm_args) else: llm = LLM(**llm_args) @@ -204,10 +211,13 @@ def launch_mm_encoder_server( default="localhost", help="Hostname of the server.") @click.option("--port", type=int, default=8000, help="Port of the server.") -@click.option("--backend", - type=click.Choice(["pytorch", "trt"]), - default="pytorch", - help="Set to 'pytorch' for pytorch path. Default is cpp path.") +@click.option( + "--backend", + type=click.Choice(["pytorch", "trt", "_autodeploy"]), + default="pytorch", + help= + "Set to 'pytorch' for pytorch path and '_autodeploy' for autodeploy path. Default is pytorch path." +) @click.option('--log_level', type=click.Choice(severity_map.keys()), default='info', diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py index da64969337..d761ae6851 100644 --- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py +++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py @@ -30,6 +30,8 @@ class TestLlama3_1_8B(LlmapiAccuracyTestHarness): return { 'skip_tokenizer_init': False, 'trust_remote_code': True, + # TODO(https://github.com/NVIDIA/TensorRT-LLM/issues/7142): + # AutoDeploy does not support cache reuse yet. 
'kv_cache_config': { 'enable_block_reuse': False, }, From c232ba8157eba3bb739cc65863d6491287ab18ab Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Fri, 22 Aug 2025 19:15:20 +0300 Subject: [PATCH 29/33] [TRTLLM-4921][feat] Enable chunked prefill for Nemotron-H (#6334) Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com> Signed-off-by: tomeras91 <57313761+tomeras91@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .../_torch/models/modeling_nemotron_h.py | 4 +- .../_torch/modules/mamba/mamba2_metadata.py | 94 +++++- .../_torch/modules/mamba/mamba2_mixer.py | 13 +- .../_torch/modules/mamba/ssd_chunk_scan.py | 9 +- .../_torch/modules/mamba/ssd_combined.py | 9 +- .../_torch/modules/mamba/ssd_state_passing.py | 85 ++++-- .../modeling/test_modeling_nemotron_h.py | 65 +++- .../_torch/thop/test_causal_conv1d_op.py | 18 +- .../thop/test_mamba2_chunk_ss_update.py | 286 +++++++++++++++++- tests/unittest/utils/torch_ref.py | 14 +- 10 files changed, 541 insertions(+), 56 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_nemotron_h.py b/tensorrt_llm/_torch/models/modeling_nemotron_h.py index 41f870f890..e548d09a08 100644 --- a/tensorrt_llm/_torch/models/modeling_nemotron_h.py +++ b/tensorrt_llm/_torch/models/modeling_nemotron_h.py @@ -221,7 +221,9 @@ class NemotronHModel(DecoderModel): ) if self.mamba_metadata is None or self.mamba_metadata.max_batch_size != attn_metadata.max_num_requests: - self.mamba_metadata = Mamba2Metadata(attn_metadata.max_num_requests) + self.mamba_metadata = Mamba2Metadata( + attn_metadata.max_num_requests, + chunk_size=self.model_config.pretrained_config.chunk_size) self.mamba_metadata.prepare(attn_metadata) if inputs_embeds is None: diff --git a/tensorrt_llm/_torch/modules/mamba/mamba2_metadata.py b/tensorrt_llm/_torch/modules/mamba/mamba2_metadata.py index 445c288e6f..d421cc9209 100644 --- a/tensorrt_llm/_torch/modules/mamba/mamba2_metadata.py +++ b/tensorrt_llm/_torch/modules/mamba/mamba2_metadata.py @@ -13,15 +13,83 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math +from typing import Tuple + import torch from tensorrt_llm._torch.attention_backend.interface import AttentionMetadata +def cu_seqlens_to_chunk_indices_offsets( + cu_seqlens: torch.Tensor, + chunk_size: int) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + cu_seqlens (torch.Tensor): 1D tensor of cumulative sequence lengths, shape (num_seqs + 1,). The first element should be 0. Each entry represents the starting index of a sequence in the flattened token array. + chunk_size (int): The size of each physical mamba chunk (number of tokens per chunk). + + Returns: + Tuple[torch.Tensor, torch.Tensor]: A tuple containing: + - chunk_indices (torch.Tensor): 1D tensor of indices indicating the physical chunk for each logical chunk. + - chunk_offsets (torch.Tensor): 1D tensor of offsets indicating the starting index of each logical chunk within its physical chunk. + + This function computes the chunk indices and offsets for the given cu_seqlens and chunk_size. + Both are tensors of integers with length N, where N is the number of logical (pseudo) chunks. + A logical chunk is a sequence of tokens that are all part of the same sequence and are all in the same physical mamba chunk. 
+ In other words, a logical chunk changes every time we cross a sequence boundary or a physical mamba chunk boundary. + Logical chunks are needed to handle batched requests with initial states (see _state_passing_fwd and _chunk_scan_fwd). + The chunk_indices tensor contains the index of the physical chunk for each logical chunk. + The chunk_offsets tensor contains the offset (AKA starting index) of the logical chunk in the physical chunk. + + Example: + cu_seqlens = [0, 5, 10] + chunk_size = 8 + -> chunk_indices = [0, 1, 0] + -> chunk_offsets = [0, 5, 0] + + In this example, we have 2 sequences, each with 5 tokens. The physical chunk size is 8 tokens. + We have three logical chunks: + - the first logical chunk starts at token 0 in the first physical chunk and contains all 5 tokens from the first sequence + - the second logical chunk starts at token 5 in the first physical chunk and contains first 3 tokens from the second sequence + - the third logical chunk starts at token 0 in the second physical chunk and contains the remaining 2 tokens from the second sequence + """ + + total_seqlens = cu_seqlens[-1] + cu_seqlens = cu_seqlens[1:] # remove prepended 0 + + # outputs will have length expansion of chunks that do not divide + # chunk_size + N = math.ceil(total_seqlens / chunk_size) + (cu_seqlens[:-1] % chunk_size + > 0).sum() + chunk_indices = torch.arange(N, dtype=torch.int, device=cu_seqlens.device) + chunk_offsets = torch.zeros((N, ), + dtype=torch.int, + device=cu_seqlens.device) + + p = 0 # num of insertions + for s, e in zip(cu_seqlens[:-1], cu_seqlens[1:]): + + # if does not divide chunk_size, then there is one chunk insertion + p += (s % chunk_size > 0) + + # get the dimensions + # - the + 1 for _e is to shift the boundary by one chunk + # - this shifting is not needed if chunk_size divides e + _s, _e = s // chunk_size + p, e // chunk_size + p + (e % chunk_size > 0) + + # adjust inidces and offsets + chunk_indices[_s:_e] -= p + chunk_offsets[_s] = s % chunk_size + + return chunk_indices, chunk_offsets + + class Mamba2Metadata: - def __init__(self, max_batch_size: int): + def __init__(self, max_batch_size: int, chunk_size: int): self.max_batch_size = max_batch_size + self.chunk_size = chunk_size # cumulative sequence lengths for prefill requests [batch_size+1] self.cu_seqlens = torch.zeros(max_batch_size + 1, @@ -31,9 +99,18 @@ class Mamba2Metadata: # sequence index for prefill requests [num_prefill_tokens] - specifies which request each token belongs to self.seq_idx: torch.Tensor = None + # helper tensors for chunked prefill + self.has_initial_states = torch.zeros(max_batch_size, + dtype=torch.bool, + device="cuda") + self.use_initial_states = False + self.chunk_indices: torch.Tensor = None + self.chunk_offsets: torch.Tensor = None + def prepare(self, attn_metadata: AttentionMetadata): num_contexts = attn_metadata.num_contexts context_lens = attn_metadata.seq_lens_cuda[:num_contexts] + num_ctx_tokens = attn_metadata.num_ctx_tokens if num_contexts > 0: torch.cumsum(context_lens, dim=0, @@ -44,4 +121,17 @@ class Mamba2Metadata: dtype=torch.int, device=self.cu_seqlens.device), repeats=context_lens, - output_size=self.cu_seqlens[num_contexts]).unsqueeze(0) + output_size=num_ctx_tokens).unsqueeze(0) + + num_cached_tokens_per_seq = attn_metadata.kv_cache_params.num_cached_tokens_per_seq + self.has_initial_states[:num_contexts] = torch.tensor( + num_cached_tokens_per_seq[:num_contexts]) > 0 + # precomputed bool to avoid host<->device syncs during forward pass + self.use_initial_states = 
torch.any( + self.has_initial_states[:num_contexts]).item() + if self.use_initial_states: + self.chunk_indices, self.chunk_offsets = cu_seqlens_to_chunk_indices_offsets( + self.cu_seqlens[:num_contexts + 1], self.chunk_size) + else: + self.chunk_indices = None + self.chunk_offsets = None diff --git a/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py b/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py index 6ea096bb6a..d5a3e3996a 100644 --- a/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py +++ b/tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py @@ -191,12 +191,15 @@ class Mamba2Mixer(nn.Module): cu_seqlens = mamba_metadata.cu_seqlens[:num_prefills + 1] seq_idx = mamba_metadata.seq_idx + has_initial_states = mamba_metadata.has_initial_states[: + num_prefills] xbc_p = causal_conv1d_fn(xbc_p.transpose(0, 1), self.conv1d.weight, self.conv1d.bias, activation="silu", conv_states=conv_states, + has_initial_state=has_initial_states, query_start_loc=cu_seqlens, cache_indices=state_indices_p).transpose( 0, 1) @@ -216,6 +219,12 @@ class Mamba2Mixer(nn.Module): "b l (h p) -> b l h p", h=self.tp_nheads) + initial_states = None + if mamba_metadata.use_initial_states: + initial_states = torch.where( + has_initial_states[:, None, None, None], + ssm_states[state_indices_p], 0) + y, current_ssm_states = mamba_chunk_scan_combined( x_p, dt_p, @@ -226,7 +235,9 @@ class Mamba2Mixer(nn.Module): D=self.D, z=z_p, dt_bias=self.dt_bias, - initial_states=None, + initial_states=initial_states, + chunk_indices=mamba_metadata.chunk_indices, + chunk_offsets=mamba_metadata.chunk_offsets, dt_softplus=self.delta_softplus, cu_seqlens=cu_seqlens, seq_idx=seq_idx, diff --git a/tensorrt_llm/_torch/modules/mamba/ssd_chunk_scan.py b/tensorrt_llm/_torch/modules/mamba/ssd_chunk_scan.py index 58615ab923..23b55d8811 100644 --- a/tensorrt_llm/_torch/modules/mamba/ssd_chunk_scan.py +++ b/tensorrt_llm/_torch/modules/mamba/ssd_chunk_scan.py @@ -314,11 +314,12 @@ def _chunk_scan_fwd_kernel( # get the cs at the offset boundary # - c_off == 0 is a passthrough + # - We need dA_cs at the boundary, defined by c_off - no need + # to increase pointer by pid_m (it is a constant offset, + # i.e. the same for all blocks) dA_cs_m_boundary = tl.load( - dA_cumsum_ptr + - (pid_m * BLOCK_SIZE_M + c_off - 1) * stride_dA_cs_csize, - mask=(((pid_m * BLOCK_SIZE_M + c_off - 1) > -1) - and ((pid_m * BLOCK_SIZE_M + c_off) < chunk_size)), + dA_cumsum_ptr + (c_off - 1) * stride_dA_cs_csize, + mask=(((c_off - 1) > -1) and (c_off < chunk_size)), other=0.0).to(tl.float32) if HAS_SEQ_IDX: diff --git a/tensorrt_llm/_torch/modules/mamba/ssd_combined.py b/tensorrt_llm/_torch/modules/mamba/ssd_combined.py index 0a6f18bb63..8edbe902bd 100644 --- a/tensorrt_llm/_torch/modules/mamba/ssd_combined.py +++ b/tensorrt_llm/_torch/modules/mamba/ssd_combined.py @@ -110,21 +110,24 @@ def _mamba_chunk_scan_combined_fwd( # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries # (middle term of factorization of off-diag blocks; A terms) # - for handling chunked prefill, this requires i) initial_states - # ii) seq_idx and iii) is_cont_batched to be all specified. + # ii) seq_idx iii) is_cont_batched and (iv) chunk_offsets to be all specified. # - When a new seq_idx is detected, we will stop passing the prev_state # and switch accordingly to the init_state corresponding to the new seq_idx. 
+ # - We will also make sure that the dA_cumsum is taken only from the start of the + # sequence (hence we need the full dA_cumsum tensor and not just the values at chunk boundaries) # - this will ensure that states will be updated with the rightmost flushed seq_idx # of the previous chunk. This implies that the first chunk of states is either 0 # or equal to init_states of the first example. states, final_states = _state_passing_fwd( rearrange(states, "... p n -> ... (p n)"), - dA_cumsum[:, :, :, -1], + dA_cumsum, initial_states=(rearrange(initial_states, "... p n -> ... (p n)") if initial_states is not None else None), seq_idx=seq_idx, chunk_size=chunk_size, out_dtype=mamba_ssm_cache_dtype or C.dtype, - is_cont_batched=cu_seqlens is not None) + is_cont_batched=cu_seqlens is not None, + chunk_offsets=chunk_offsets) states, final_states = [ rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states] diff --git a/tensorrt_llm/_torch/modules/mamba/ssd_state_passing.py b/tensorrt_llm/_torch/modules/mamba/ssd_state_passing.py index e1c4b61eaf..f751d4cd5f 100644 --- a/tensorrt_llm/_torch/modules/mamba/ssd_state_passing.py +++ b/tensorrt_llm/_torch/modules/mamba/ssd_state_passing.py @@ -41,6 +41,8 @@ def _state_passing_fwd_kernel( dA_cs_ptr, initstates_ptr, seq_idx_ptr, + chunk_offsets_ptr, + chunk_meta_num, # Matrix dimensions dim, nchunks, @@ -61,6 +63,7 @@ def _state_passing_fwd_kernel( stride_dA_cs_batch, stride_dA_cs_chunk, stride_dA_cs_head, + stride_dA_cs_csize, stride_initstates_batch, stride_initstates_head, stride_initstates_dim, @@ -76,7 +79,8 @@ def _state_passing_fwd_kernel( pid_h = tl.program_id(axis=2) pid_m = tl.program_id(axis=0) states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head - dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head + dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head + ( + chunk_size - 1) * stride_dA_cs_csize out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head final_states_ptr += (pid_b * stride_final_states_batch + pid_h * stride_final_states_head) @@ -105,35 +109,63 @@ def _state_passing_fwd_kernel( other=0.0).to(tl.float32) tl.store(out_ptrs, states, mask=offs_m < dim) out_ptrs += stride_out_chunk - seq_idx = 0 + prev_seq_idx_chunk_end = 0 + logical_chunk_idx = 0 for c in range(nchunks): new_states = tl.load(states_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) dA_cs = tl.load(dA_cs_ptr).to(tl.float32) - scale = tl.exp(dA_cs) + scale_mask = True if HAS_SEQ_IDX: # - the seq to pass forward is the one that is flushed to the right # boundary. - # - that is given by seq_idx_new below. - seq_idx_new = tl.load(seq_idx_ptr + - (min((c + 1) * chunk_size, seqlen) - 1) * - stride_seq_idx_seqlen) + # - that is given by seq_idx_chunk_end below: the sequence index at the end of the chunk. + seq_idx_chunk_end = tl.load(seq_idx_ptr + (min( + (c + 1) * chunk_size, seqlen) - 1) * stride_seq_idx_seqlen) if HAS_INITSTATES: - if IS_CONT_BATCHED and seq_idx != seq_idx_new: + if IS_CONT_BATCHED and prev_seq_idx_chunk_end != seq_idx_chunk_end: # this means in the current chunk the rightmost flushed seq # has changed. 
# - so we do not propagate the state from previous chunk # - but rather we load that sequence's init state - initstates_ptrs = initstates_ptr + seq_idx_new * stride_initstates_batch + initstates_ptrs = initstates_ptr + seq_idx_chunk_end * stride_initstates_batch # - update state with seq_idx_new's init state states = tl.load(initstates_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32) - else: - scale = tl.where(seq_idx_new == seq_idx, scale, 0.0) - seq_idx = seq_idx_new + # - we need to consider the cumsum only of the last sequence in the chunk + # - find its starting position (given by c_off of the logical chunk index) + # - and subtract the cumsum just before that position from the total cumsum + # - first, update the logical chunk index (add the number of sequences in the current physical chunk): + # sequence index at the start of the current chunk + seq_idx_chunk_start = tl.load(seq_idx_ptr + + min(c * chunk_size, seqlen) * + stride_seq_idx_seqlen) + logical_chunk_idx += (seq_idx_chunk_end - + seq_idx_chunk_start) + # - load the chunk offset: + c_off = tl.load(chunk_offsets_ptr + logical_chunk_idx, + mask=logical_chunk_idx < chunk_meta_num, + other=0) + # - if offset is 0, then the sequence starts at the beginning of the chunk, and we don't need to subtract anything + if c_off > 0: + # - dA_cs_ptr currently points to the cumsum at the end of the chunk - subtract the chunk size and add the offset + dA_cs_boundary = tl.load( + dA_cs_ptr - (chunk_size - 1) * stride_dA_cs_csize + + (c_off - 1) * stride_dA_cs_csize, + mask=(c_off - 1) > -1 and c_off < chunk_size, + other=0.0) + dA_cs -= dA_cs_boundary + + # - increment logical chunk index for every physical chunk + logical_chunk_idx += 1 + else: + scale_mask = seq_idx_chunk_end == prev_seq_idx_chunk_end + prev_seq_idx_chunk_end = seq_idx_chunk_end + + scale = tl.where(scale_mask, tl.exp(dA_cs), 0.0) states = scale * states + new_states if c < nchunks - 1: tl.store(out_ptrs, states, mask=offs_m < dim) @@ -146,28 +178,36 @@ def _state_passing_fwd_kernel( def _state_passing_fwd( states, - dA_chunk_cumsum, + dA_cumsum, initial_states=None, seq_idx=None, chunk_size=None, out_dtype=None, is_cont_batched=False, + chunk_offsets=None, ): batch, nchunks, nheads, dim = states.shape - assert dA_chunk_cumsum.shape == (batch, nheads, nchunks) + if chunk_size is None: + chunk_size = dA_cumsum.shape[-1] + else: + assert chunk_size == dA_cumsum.shape[-1] + assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size) if initial_states is not None: if is_cont_batched: # - if cu_seqlens is provided, then the initial states # are used for continuous batching. In which case we # require seq_idx to be provided - assert seq_idx is not None, "" + assert seq_idx is not None, "seq_idx must be provided for continuous batching" + # - we also need chunk_offsets to be provided, to account + # for computation of dA_cumsum from the start of the + # sequence + assert chunk_offsets is not None, "chunk_offsets must be provided for continuous batching" else: # - this is the regular batching case, where initial # states are used are for each example of the batch. 
assert initial_states.shape == (batch, nheads, dim) if seq_idx is not None: - assert chunk_size is not None seqlen = seq_idx.shape[-1] assert seq_idx.shape == (batch, seqlen) out_dtype = states.dtype if out_dtype is None else out_dtype @@ -183,13 +223,15 @@ def _state_passing_fwd( states, out, final_states, - dA_chunk_cumsum, + dA_cumsum, initial_states, seq_idx, + chunk_offsets, + len(chunk_offsets) if chunk_offsets is not None else 0, dim, nchunks, seqlen if seq_idx is not None else 0, - chunk_size if seq_idx is not None else 0, + chunk_size, states.stride(0), states.stride(1), states.stride(2), @@ -201,9 +243,10 @@ def _state_passing_fwd( final_states.stride(0), final_states.stride(1), final_states.stride(2), - dA_chunk_cumsum.stride(0), - dA_chunk_cumsum.stride(2), - dA_chunk_cumsum.stride(1), + dA_cumsum.stride(0), + dA_cumsum.stride(2), + dA_cumsum.stride(1), + dA_cumsum.stride(3), *(( initial_states.stride(0), initial_states.stride(1), diff --git a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py index 58c854931e..3e727e654b 100644 --- a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py +++ b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py @@ -33,7 +33,9 @@ def extract_decode_logprobs(result: RequestOutput, def create_nemotron_h_llm(use_cuda_graph, disable_overlap_scheduler, max_batch_size, - mamba_ssm_cache_dtype=None): + mamba_ssm_cache_dtype=None, + enable_chunked_prefill=False, + max_num_tokens=None): """Create LLM with specific overlap scheduler setting""" model_dir = f"{llm_models_root(check=True)}/Nemotron-H-8B-Base-8K" return LLM( @@ -47,6 +49,8 @@ def create_nemotron_h_llm(use_cuda_graph, mamba_ssm_cache_dtype="auto" if mamba_ssm_cache_dtype is None else mamba_ssm_cache_dtype), sampler_type="TRTLLMSampler", + enable_chunked_prefill=enable_chunked_prefill, + max_num_tokens=max_num_tokens, ) @@ -336,3 +340,62 @@ def test_nemotron_h_cuda_graph_overlap_scheduler(): msg=lambda x: f"Prompt {i}: with/without overlap scheduler (with CG) logprobs for all selected tokens {x}" ) + + +def test_nemotron_h_chunked_prefill(): + # Long prompts (~100 tokens) to make sure chunked prefill is enabled + # (At the time of development, tokens_per_block isn't configurable from the LLM API, + # and max_tokens (i.e. chunk size) needs to be a multiple of tokens_per_block) + prompts = [ + "Artificial Intelligence in Healthcare: Artificial intelligence (AI) is transforming healthcare by improving diagnostics, treatment plans, and patient care. AI algorithms can analyze medical images with high accuracy, assist in early disease detection, and personalize treatment plans based on patient data. Additionally, AI-powered chatbots and virtual assistants provide support to patients, enhancing accessibility and efficiency in healthcare services. As AI technology continues to advance, its integration into healthcare systems promises to deliver better outcomes and reduce costs. With continuous research and development, AI in healthcare is poised to", + "The Role of Cloud Computing: Cloud computing has revolutionized the way businesses operate by providing scalable, on-demand access to computing resources. This technology allows organizations to store and process data remotely, reducing the need for physical infrastructure and enabling greater flexibility. Cloud services facilitate collaboration, enhance data security, and support the deployment of innovative applications. 
As businesses increasingly adopt cloud solutions, they benefit from improved efficiency, cost savings, and the ability to rapidly adapt to changing market conditions. Companies leveraging cloud computing are better positioned to", + "Advancements in Renewable Energy: Renewable energy technologies, such as solar and wind power, are crucial for addressing climate change and reducing dependence on fossil fuels. Advances in energy storage, grid integration, and efficiency are making renewable energy sources more viable and cost-effective. Innovations in materials science and engineering are also driving the development of next-generation renewable technologies. As global efforts to combat climate change intensify, the continued advancement of renewable energy will play a pivotal role in achieving a sustainable future. Governments and industries are increasingly investing in", + "The Importance of Cybersecurity: In today's digital age, cybersecurity has become essential to protect sensitive information and maintain the integrity of systems. With the rise of cyber threats such as hacking, phishing, and ransomware, organizations must implement robust security measures to safeguard their data. Cybersecurity involves a combination of technologies, processes, and practices designed to defend against unauthorized access and attacks. By staying vigilant and updating security protocols, businesses can mitigate risks and ensure the safety of their digital assets. Proactive cybersecurity strategies are crucial in", + "The Impact of Artificial Intelligence on Education: Artificial intelligence is reshaping education by providing personalized learning experiences and automating administrative tasks. AI-driven educational tools can adapt to individual student needs, offering tailored feedback and resources to enhance learning outcomes. Additionally, AI can streamline administrative processes, allowing educators to focus more on teaching and student engagement. As AI continues to evolve, its role in education will expand, offering new opportunities for innovation and efficiency. The integration of AI in classrooms promises to revolutionize how students learn and how educators manage their", + ] + sampling_config = SamplingParams(max_tokens=10, + temperature=0.0, + return_context_logits=True, + return_generation_logits=True) + + with create_nemotron_h_llm(use_cuda_graph=False, + disable_overlap_scheduler=True, + max_batch_size=16) as llm: + outputs = llm.generate(prompts, + sampling_params=sampling_config, + use_tqdm=True) + + with create_nemotron_h_llm(use_cuda_graph=False, + disable_overlap_scheduler=True, + max_batch_size=16, + enable_chunked_prefill=True, + max_num_tokens=64) as llm: + chunked_prefill_outputs = llm.generate(prompts, + sampling_params=sampling_config, + use_tqdm=True) + + for i, (output, chunked_prefill_output) in enumerate( + zip(outputs, chunked_prefill_outputs)): + assert output.outputs[0].text == chunked_prefill_output.outputs[0].text + + # assert same prefill logprobs. 
Same atol as diff between mcore and initial impl + prefill_logprobs = extract_prefill_logprobs(output) + chunked_prefill_logprobs = extract_prefill_logprobs( + chunked_prefill_output) + torch.testing.assert_close( + prefill_logprobs, + chunked_prefill_logprobs, + atol=0.3, + rtol=0.05, + msg=lambda x: f"Prompt {i} prefill logprobs {x}") + + # Decode logprobs shouldn't be affected by chunked prefill - tolerance like batching tolerance + decode_logprobs = extract_decode_logprobs(output) + chunked_decode_logprobs = extract_decode_logprobs( + chunked_prefill_output) + torch.testing.assert_close( + decode_logprobs, + chunked_decode_logprobs, + atol=0.2, + rtol=0.05, + msg=lambda x: f"Prompt {i} decode logprobs {x}") diff --git a/tests/unittest/_torch/thop/test_causal_conv1d_op.py b/tests/unittest/_torch/thop/test_causal_conv1d_op.py index c5e42e2618..54793854c9 100644 --- a/tests/unittest/_torch/thop/test_causal_conv1d_op.py +++ b/tests/unittest/_torch/thop/test_causal_conv1d_op.py @@ -26,11 +26,15 @@ from tensorrt_llm.llmapi.utils import get_total_gpu_memory @pytest.mark.parametrize( - "dim, dconv, req_type, dtype, batch_size, max_seq_len, remove_padding, apply_silu, paged_cache", + "dim, dconv, req_type, dtype, batch_size, max_seq_len, remove_padding, apply_silu, paged_cache, use_initial_state", list( product([2048], [4], ['context', 'generation'], ['float16', 'float32', 'bfloat16'], [5], [16], [False, True], - [False, True], [False, True])) + + [False, True], [False, True], [False])) + + # test with initial state + list( + product([2048], [4], ['context'], ['bfloat16'], [5], [16], + [False, True], [False], [False, True], [True])) + # long sequence tests to cover the int overflow issue list( map( @@ -42,10 +46,11 @@ from tensorrt_llm.llmapi.utils import get_total_gpu_memory "The long sequence test needs at least 33GB memory, skipping" )), product([5376], [4], ['context'], ['float16', 'bfloat16'], [2], - [131072], [False, True], [False, True], [False])))) + [131072], [False, True], [False, True], [False], [False])))) @pytest.mark.high_cuda_memory def test_causal_conv1d(dim, dconv, req_type, dtype, batch_size, max_seq_len, - remove_padding, apply_silu, paged_cache): + remove_padding, apply_silu, paged_cache, + use_initial_state): device = "cuda" seq_len = max_seq_len if req_type == "context" else 1 mean = 0.0 @@ -68,7 +73,7 @@ def test_causal_conv1d(dim, dconv, req_type, dtype, batch_size, max_seq_len, host_context_lengths = torch.ones( (batch_size, ), dtype=torch.int32) * seq_len - if req_type == "context": + if req_type == "context" and not use_initial_state: conv_state = torch.zeros([batch_size, dim, dconv - 1], dtype=torch_dtype, device=device) @@ -111,7 +116,8 @@ def test_causal_conv1d(dim, dconv, req_type, dtype, batch_size, max_seq_len, conv_weight_input = conv_weight.squeeze(1).contiguous() if req_type == "context": - has_initial_state = None + has_initial_state = None if not use_initial_state else torch.ones( + batch_size, device=device, dtype=torch.bool) torch.ops.trtllm.causal_conv1d_fwd( x_in_out, diff --git a/tests/unittest/_torch/thop/test_mamba2_chunk_ss_update.py b/tests/unittest/_torch/thop/test_mamba2_chunk_ss_update.py index ea3c2c2c3c..e26fe00776 100644 --- a/tests/unittest/_torch/thop/test_mamba2_chunk_ss_update.py +++ b/tests/unittest/_torch/thop/test_mamba2_chunk_ss_update.py @@ -21,6 +21,8 @@ from einops import rearrange, repeat from utils.torch_ref import (selective_state_update_ref, ssd_chunk_scan_combined_ref) +from tensorrt_llm._torch.modules.mamba.mamba2_metadata 
import \ + cu_seqlens_to_chunk_indices_offsets from tensorrt_llm._torch.modules.mamba.selective_state_update import \ selective_state_update from tensorrt_llm._torch.modules.mamba.ssd_combined import \ @@ -30,51 +32,58 @@ from tensorrt_llm.llmapi.utils import get_total_gpu_memory @pytest.mark.parametrize( - "dim, headdim, ngroups, dstate, req_type, dtype, batch_size, max_seq_len, has_z, remove_padding, paged_cache", + "dim, headdim, ngroups, dstate, req_type, dtype, batch_size, max_seq_len, has_z, remove_padding, paged_cache, use_initial_states", # dim parametrization list( product([1024, 2048, 5120], [64], [1], [128], ['context', 'generation'], - ['bfloat16'], [3], [16], [False], [True], [False])) + + ['bfloat16'], [3], [16], [False], [True], [False], [False])) + # headdim parametrization list( product([2048], [32, 64, 128, 256], [1], [128], ['context', 'generation'], ['bfloat16'], [3], [16], [False], - [True], [False])) + + [True], [False], [False])) + # ngroups parametrization list( product([2048], [64], [1, 4], [128], ['context', 'generation'], - ['bfloat16'], [3], [16], [False], [True], [False])) + + ['bfloat16'], [3], [16], [False], [True], [False], [False])) + # dstate parametrization list( product([2048], [64], [1], [64, 96, 128, 256], ['context', 'generation'], ['bfloat16'], [3], [16], [False], - [True], [False])) + + [True], [False], [False])) + # dtype parametrization list( product([2048], [64], [1], [128], ['context', 'generation'], ['float16', 'bfloat16', 'float32'], [3], [16], [False], [True], - [False])) + + [False], [False])) + # batch_size parametrization list( product([2048], [64], [1], [128], ['context', 'generation'], - ['bfloat16'], [1, 2, 8, 16], [16], [False], [True], [False])) + + ['bfloat16'], [1, 2, 8, 16], [16], [False], [True], [False], + [False])) + # max_seq_len parametrization list( product([2048], [64], [1], [128], ['context', 'generation'], ['bfloat16'], [3], [32, 64, 256, 2048, 16384], [False], [True], - [False])) + + [False], [False])) + # has_z parametrization list( product([2048], [64], [1], [128], ['context', 'generation'], - ['bfloat16'], [3], [32], [True, False], [True], [False])) + + ['bfloat16'], [3], [32], [True, False], [True], [False], + [False])) + # remove_padding parametrization list( product([2048], [64], [1], [128], ['context', 'generation'], - ['bfloat16'], [3], [32], [False], [True, False], [False])) + + ['bfloat16'], [3], [32], [False], [True, False], [False], + [False])) + # paged_cache parametrization (relevant for generation only) list( product([2048], [64], [1], [128], ['generation'], ['bfloat16'], [3], - [32], [False], [False], [True, False])) + + [32], [False], [False], [True, False], [False])) + + # use_initial_states parametrization (relevant for context only and remove_padding=True) + list( + product([2048], [64], [1], [128], ['context'], ['bfloat16'], [3], [32], + [False], [True], [False], [True, False])) + # long sequence test to cover the int overflow issue [ pytest.param( @@ -89,6 +98,7 @@ from tensorrt_llm.llmapi.utils import get_total_gpu_memory False, False, False, + False, marks=pytest.mark.skipif( get_total_gpu_memory(0) < 68 * 1024**3, reason= @@ -97,7 +107,8 @@ from tensorrt_llm.llmapi.utils import get_total_gpu_memory def test_mamba2_chunk_scan_selective_state_update(dim, headdim, ngroups, dstate, req_type, dtype, batch_size, max_seq_len, has_z, - remove_padding, paged_cache): + remove_padding, paged_cache, + use_initial_states): # configs device = "cuda" seq_len = max_seq_len if req_type == 'context' else 1 @@ 
-168,6 +179,8 @@ def test_mamba2_chunk_scan_selective_state_update(dim, headdim, ngroups, dstate, D = torch.randn(nheads, device=device) if has_z: z = torch.randn_like(x) + if use_initial_states: + initial_states = state.clone() if req_type == 'generation': # remove the seqlen dimension @@ -193,8 +206,13 @@ def test_mamba2_chunk_scan_selective_state_update(dim, headdim, ngroups, dstate, C_ref = C.detach().clone() D_ref = D.detach().clone() z_ref = z.detach().clone() if has_z else None + initial_states_ref = state_ref.clone() if use_initial_states else None if req_type == "context": + if use_initial_states: + assert remove_padding + chunk_indices, chunk_offsets = cu_seqlens_to_chunk_indices_offsets( + cu_seqlens, chunk_size) out, ssm_state = mamba_chunk_scan_combined( x, dt, @@ -205,6 +223,9 @@ def test_mamba2_chunk_scan_selective_state_update(dim, headdim, ngroups, dstate, D=D, z=z if has_z else None, dt_bias=dt_bias, + initial_states=initial_states if use_initial_states else None, + chunk_indices=chunk_indices if use_initial_states else None, + chunk_offsets=chunk_offsets if use_initial_states else None, seq_idx=seq_idx if remove_padding else None, cu_seqlens=cu_seqlens if remove_padding else None, dt_softplus=delta_softplus, @@ -273,7 +294,10 @@ def test_mamba2_chunk_scan_selective_state_update(dim, headdim, ngroups, dstate, D=D_ref, z=z_ref[:, start:end, ...] if has_z else None, dt_bias=dt_bias_ref, - dt_softplus=delta_softplus) + dt_softplus=delta_softplus, + initial_states=initial_states_ref[i:i + 1, ...] + if use_initial_states else None, + ) out_ref[0, start:end, ...] = part_out_ref.squeeze(0) state_ref[i, ...] = part_state_ref.squeeze(0) elif long_context: @@ -295,7 +319,10 @@ def test_mamba2_chunk_scan_selective_state_update(dim, headdim, ngroups, dstate, D=D_ref, z=z_ref[i:i + 1, ...] if has_z else None, dt_bias=dt_bias_ref, - dt_softplus=delta_softplus) + dt_softplus=delta_softplus, + initial_states=initial_states_ref[i:i + 1, ...] + if use_initial_states else None, + ) out_ref[i, ...] = part_out_ref.squeeze(0) state_ref[i, ...] 
= part_state_ref.squeeze(0) else: @@ -309,7 +336,10 @@ def test_mamba2_chunk_scan_selective_state_update(dim, headdim, ngroups, dstate, D=D_ref, z=z_ref if has_z else None, dt_bias=dt_bias_ref, - dt_softplus=delta_softplus) + dt_softplus=delta_softplus, + initial_states=initial_states_ref + if use_initial_states else None, + ) elif req_type == 'generation': out_ref = selective_state_update_ref(state_ref, x_ref, @@ -330,3 +360,229 @@ def test_mamba2_chunk_scan_selective_state_update(dim, headdim, ngroups, dstate, state_ref, rtol=1e-2, atol=atol[dtype]) + + +@pytest.mark.parametrize("mamba_chunk_size", [8, 256]) +@pytest.mark.parametrize("seqlens", [ + (16, 2, 8, 13), + (270, 88, 212, 203), + (16, 20), +]) +def test_mamba2_chunk_scan_combined_prefill_chunking(mamba_chunk_size, seqlens): + dim = 1024 + headdim = 64 + ngroups = 1 + dstate = 128 + + # test in high precision to distinguish between numeric instabilities and actual errors + dtype = 'float32' + + num_sequences = len(seqlens) + has_z = True + + device = "cuda" + nheads = dim // headdim + delta_softplus = True + mean = 0.0 + std_dev = 0.1 + + torch_dtype = str_dtype_to_torch(dtype) + + seqlens = torch.tensor(seqlens, dtype=torch.int32, device=device) + cu_seqlens = torch.cat([ + torch.tensor([0], dtype=torch.int32, device=device), + torch.cumsum(seqlens, dim=0, dtype=torch.int32) + ], + dim=0) + seq_idx = torch.repeat_interleave(torch.arange(len(seqlens), + dtype=torch.int32, + device=device), + seqlens, + output_size=cu_seqlens[-1]).unsqueeze(0) + input_batch_size = 1 + input_seq_len = cu_seqlens[-1] + + # test data + torch.random.manual_seed(0) + x = torch.empty(input_batch_size, + input_seq_len, + nheads, + headdim, + device=device, + dtype=torch_dtype) + x.normal_(mean, std_dev) + dt = torch.randn(input_batch_size, + input_seq_len, + nheads, + device=device, + dtype=torch_dtype) + dt_bias = torch.rand(nheads, device=device) - 4.0 + A = -torch.rand(nheads, device=device) - 1.0 + B = torch.randn(input_batch_size, + input_seq_len, + ngroups, + dstate, + device=device, + dtype=torch_dtype) + C = torch.randn_like(B) + D = torch.randn(nheads, device=device) + + z = torch.randn_like(x) + + ## full seqlen computation + out_ref, state_ref = mamba_chunk_scan_combined( + x, + dt, + A, + B, + C, + chunk_size=mamba_chunk_size, + D=D, + z=z if has_z else None, + dt_bias=dt_bias, + seq_idx=seq_idx, + cu_seqlens=cu_seqlens, + dt_softplus=delta_softplus, + return_final_states=False, + return_varlen_states=True, + ) + + ## chunked seqlen computation + # first chunk + chunked_seqlens = seqlens // 2 + chunked_cu_seqlens = torch.cat([ + torch.tensor([0], dtype=torch.int32, device=device), + torch.cumsum(chunked_seqlens, dim=0, dtype=torch.int32) + ], + dim=0) + chunked_seq_idx = torch.repeat_interleave( + torch.arange(len(chunked_seqlens), dtype=torch.int32, device=device), + chunked_seqlens, + output_size=chunked_cu_seqlens[-1]).unsqueeze(0) + chunked_input_seq_len = chunked_cu_seqlens[-1] + x_chunked = torch.zeros_like(x)[:, :chunked_input_seq_len, ...] + dt_chunked = torch.zeros_like(dt)[:, :chunked_input_seq_len, ...] + B_chunked = torch.zeros_like(B)[:, :chunked_input_seq_len, ...] + C_chunked = torch.zeros_like(C)[:, :chunked_input_seq_len, ...] + z_chunked = torch.zeros_like(z)[:, :chunked_input_seq_len, ...] + for i in range(num_sequences): + # yapf: disable + chunk_f = lambda x, i: x[:, cu_seqlens[i]:cu_seqlens[i] + chunked_seqlens[i], ...] + + x_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] 
= chunk_f(x, i) + dt_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(dt, i) + B_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(B, i) + C_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(C, i) + z_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(z, i) + # yapf: enable + + partial_out, partial_state = mamba_chunk_scan_combined( + x_chunked, + dt_chunked, + A, + B_chunked, + C_chunked, + chunk_size=mamba_chunk_size, + D=D, + z=z_chunked, + dt_bias=dt_bias, + seq_idx=chunked_seq_idx, + cu_seqlens=chunked_cu_seqlens, + dt_softplus=delta_softplus, + return_final_states=False, + return_varlen_states=True, + ) + + # remaining chunk + remaining_chunked_seqlens = seqlens - chunked_seqlens + remaining_chunked_cu_seqlens = torch.cat([ + torch.tensor([0], dtype=torch.int32, device=device), + torch.cumsum(remaining_chunked_seqlens, dim=0, dtype=torch.int32) + ], + dim=0) + remaining_chunked_seq_idx = torch.repeat_interleave( + torch.arange(len(remaining_chunked_seqlens), + dtype=torch.int32, + device=device), + remaining_chunked_seqlens, + output_size=remaining_chunked_cu_seqlens[-1]).unsqueeze(0) + remaining_chunked_input_seq_len = remaining_chunked_cu_seqlens[-1] + # yapf: disable + remaining_x_chunked = torch.zeros_like(x)[:, :remaining_chunked_input_seq_len, ...] + remaining_dt_chunked = torch.zeros_like(dt)[:, :remaining_chunked_input_seq_len, ...] + remaining_B_chunked = torch.zeros_like(B)[:, :remaining_chunked_input_seq_len, ...] + remaining_C_chunked = torch.zeros_like(C)[:, :remaining_chunked_input_seq_len, ...] + remaining_z_chunked = torch.zeros_like(z)[:, :remaining_chunked_input_seq_len, ...] + for i in range(num_sequences): + remaining_chunk_f = lambda x, i: x[:, cu_seqlens[i] + chunked_seqlens[i]:cu_seqlens[i+1], ...] + + remaining_x_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(x, i) + remaining_dt_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(dt, i) + remaining_B_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(B, i) + remaining_C_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(C, i) + remaining_z_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] 
= remaining_chunk_f(z, i) + + # assert input chunking is correct + concat_chunk_f = lambda pt1, pt2, i: torch.cat([ + pt1[:,chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1],...], + pt2[:,remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1],...], + ], + dim=1) + concat_batch_f = lambda pt1, pt2: torch.cat([concat_chunk_f(pt1, pt2, i) for i in range(num_sequences)], dim=1) + + assert concat_batch_f(x_chunked, remaining_x_chunked).equal(x) + assert concat_batch_f(dt_chunked, remaining_dt_chunked).equal(dt) + assert concat_batch_f(B_chunked, remaining_B_chunked).equal(B) + assert concat_batch_f(C_chunked, remaining_C_chunked).equal(C) + assert concat_batch_f(z_chunked, remaining_z_chunked).equal(z) + # yapf: enable + + chunk_indices, chunk_offsets = cu_seqlens_to_chunk_indices_offsets( + remaining_chunked_cu_seqlens, mamba_chunk_size) + + out_chunked, state_chunked = mamba_chunk_scan_combined( + remaining_x_chunked, + remaining_dt_chunked, + A, + remaining_B_chunked, + remaining_C_chunked, + chunk_size=mamba_chunk_size, + D=D, + z=remaining_z_chunked, + dt_bias=dt_bias, + initial_states=partial_state, + chunk_indices=chunk_indices, + chunk_offsets=chunk_offsets, + seq_idx=remaining_chunked_seq_idx, + cu_seqlens=remaining_chunked_cu_seqlens, + dt_softplus=delta_softplus, + return_final_states=False, + return_varlen_states=True, + ) + out = concat_batch_f(partial_out, out_chunked) + + # kernel chunked is same as kernel overall + # tight tolerance to find subtle correctness issues + rtol = 1e-2 + atol = 2e-3 + for i in range(num_sequences): + out_seq = out[:, cu_seqlens[i]:cu_seqlens[i + 1], ...] + out_seq_ref = out_ref[:, cu_seqlens[i]:cu_seqlens[i + 1], ...] + torch.testing.assert_close(out_seq[:, :chunked_seqlens[i], ...], + out_seq_ref[:, :chunked_seqlens[i], ...], + rtol=rtol, + atol=atol, + msg=lambda x: f"seq{i} output part1 " + x) + torch.testing.assert_close(out_seq[:, chunked_seqlens[i]:, ...], + out_seq_ref[:, chunked_seqlens[i]:, ...], + rtol=rtol, + atol=atol, + msg=lambda x: f"seq{i} output part2 " + x) + + state_seq = state_chunked[i] + state_seq_ref = state_ref[i] + torch.testing.assert_close(state_seq, + state_seq_ref, + rtol=rtol, + atol=atol, + msg=lambda x: f"seq{i} state " + x) diff --git a/tests/unittest/utils/torch_ref.py b/tests/unittest/utils/torch_ref.py index 6e666bed26..d8a6b258c5 100644 --- a/tests/unittest/utils/torch_ref.py +++ b/tests/unittest/utils/torch_ref.py @@ -480,7 +480,8 @@ def ssd_chunk_scan_combined_ref(x, D=None, z=None, dt_bias=None, - dt_softplus=False): + dt_softplus=False, + initial_states=None): """ Argument: x: (batch, seqlen, nheads, headdim) @@ -492,6 +493,7 @@ def ssd_chunk_scan_combined_ref(x, D: (nheads, headdim) or (nheads,) z: (batch, seqlen, nheads, headdim) dt_bias: (nheads,) + initial_states: (batch, nheads, dstate, headdim) Return: out: (batch, seqlen, nheads, headdim) final_states: (batch, nheads, dstate, headdim) @@ -520,8 +522,16 @@ def ssd_chunk_scan_combined_ref(x, states = states.to(torch.float32) # 2. Pass the state to all the chunks by weighted cumsum. # state_passing_ref is much less numerically stable + # align initial_states shape with states shape + initial_states = rearrange( + initial_states, + "... n p -> ... p n") if initial_states is not None else None states, final_states = state_passing_ref( - rearrange(states, "... p n -> ... (p n)"), dA_cumsum[:, :, :, -1]) + rearrange(states, "... p n -> ... (p n)"), + dA_cumsum[:, :, :, -1], + rearrange(initial_states, "... p n-> ... 
(p n)") + if initial_states is not None else None, + ) states, final_states = [ rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states] From 37543a9ad72020bda12117e709ac7b59aeaeea41 Mon Sep 17 00:00:00 2001 From: Robin Kobus <19427718+Funatiq@users.noreply.github.com> Date: Fri, 22 Aug 2025 18:44:17 +0200 Subject: [PATCH 30/33] [None][refactor] Simplify decoder state initialization for speculative decoding (#6869) Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com> --- .../batch_manager/createNewDecoderRequests.h | 32 -- .../tensorrt_llm/batch_manager/llmRequest.h | 2 +- .../tensorrt_llm/runtime/decodingInput.h | 2 + cpp/include/tensorrt_llm/runtime/request.h | 54 --- .../createNewDecoderRequests.cpp | 401 +++++++++--------- .../batch_manager/utils/logitsThread.cpp | 15 +- .../batch_manager/utils/logitsThread.h | 6 +- cpp/tensorrt_llm/runtime/decoderState.cpp | 20 +- 8 files changed, 225 insertions(+), 307 deletions(-) delete mode 100644 cpp/include/tensorrt_llm/runtime/request.h diff --git a/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h b/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h index 394f7fb7bf..0978905b5e 100644 --- a/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h +++ b/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h @@ -24,7 +24,6 @@ #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/modelConfig.h" -#include "tensorrt_llm/runtime/request.h" #include "tensorrt_llm/runtime/worldConfig.h" namespace tensorrt_llm::runtime @@ -88,37 +87,6 @@ public: SizeType32 maxSequenceLength, OptionalRef medusaBuffers) const; private: - //! @brief Setups decoder internal tensors for new speculative decoding request - static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, - SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig, - DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream, - CudaStream const& decoderStream, SpeculativeDecodingMode const& speculativeDecodingMode, - SizeType32 maxDecodingEngineTokens); - - //! @brief Setups decoder internal tensors for new request in Draft model Sps mode - static void newRequestDraftTokensExternal(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, - SamplingConfig const& samplingConfig, DecodingInput& jointDecodingInput, CudaStream const& decoderStream); - - //! @brief Setups decoder internal tensors for new Medusa request - static void newRequestMedusa(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, - DecodingInput& jointDecodingInput, CudaStream const& decoderStream, SizeType32 maxDecodingEngineTokens); - - //! @brief Setups decoder internal tensors for new Lookahead request - static void newRequestLookahead(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, - DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream); - - //! @brief Setups decoder internal tensors for new Explicit draft tokens request - static void newRequestExplicitDraftTokens(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, - DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream); - - //! 
@brief Setups decoder internal tensors for new Eagle request - static void newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, - runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream); - - [[nodiscard]] std::shared_ptr retrieveDraftLogits(runtime::ModelConfig const& modelConfig, - runtime::WorldConfig const& worldConfig, std::shared_ptr const& tensor, - runtime::BufferManager const& bufferManager) const; - bool mSpeculativeDecodingFastLogits; bool mIsLeaderInOrchMode; bool mIsNormalizeLogProbs; diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index e4d13c9e17..f069e3ac7f 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -1110,7 +1110,7 @@ public: [[nodiscard]] SizeType32 getNumDraftTokens() const { - return mDraftTokens->size(); + return hasDraftTokens() ? mDraftTokens->size() : 0; } void discardDraftTokens(SizeType32 numTokensToDiscard) diff --git a/cpp/include/tensorrt_llm/runtime/decodingInput.h b/cpp/include/tensorrt_llm/runtime/decodingInput.h index deeb0fa0af..4344f423ac 100644 --- a/cpp/include/tensorrt_llm/runtime/decodingInput.h +++ b/cpp/include/tensorrt_llm/runtime/decodingInput.h @@ -102,11 +102,13 @@ public: { public: TensorPtr draftLogits; + TensorPtr draftLogitsHost; TensorPtr draftProbs; TensorPtr targetProbs; TensorPtr numDraftTokens; TensorPtr numDraftTokensHost; TensorPtr draftTokenIds; + TensorPtr draftTokenIdsHost; TensorPtr useDraftLogits; TensorPtr useDraftLogitsHost; diff --git a/cpp/include/tensorrt_llm/runtime/request.h b/cpp/include/tensorrt_llm/runtime/request.h deleted file mode 100644 index e8f851b7d7..0000000000 --- a/cpp/include/tensorrt_llm/runtime/request.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "tensorrt_llm/executor/executor.h" -#include "tensorrt_llm/runtime/iTensor.h" - -#include - -namespace tensorrt_llm::runtime::decoder_batch -{ - -class Request -{ -public: - using TensorConstPtr = ITensor::SharedConstPtr; - using TensorPtr = ITensor::SharedPtr; - using BufferPtr = IBuffer::SharedPtr; - - explicit Request(SizeType32 inputLen) - : inputLen(inputLen) - { - } - - //! Mandatory parameters - SizeType32 inputLen; // Input length without draft tokens, increasing with generation steps - - // optional parameters - SizeType32 generatedTokensPerEngineStep{1}; // - - //! 
Optional parameters for speculative decoding - BufferPtr draftTokens; // [generatedTokensPerEngineStep - 1] on gpu - std::optional draftLogits; // [generatedTokensPerEngineStep - 1, vocabSize] on gpu - TensorPtr medusaPaths; // [maxDecodingTokens, maxPathLen], on gpu - TensorPtr medusaTreeIds; // [maxDecodingTokens], on gpu - std::optional lookaheadRuntimeConfig; - std::optional eagleConfig; -}; - -} // namespace tensorrt_llm::runtime::decoder_batch diff --git a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp index 16771709bb..3335d69a01 100644 --- a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp +++ b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp @@ -20,11 +20,14 @@ #include "tensorrt_llm/batch_manager/llmRequest.h" #include "tensorrt_llm/batch_manager/medusaBuffers.h" #include "tensorrt_llm/batch_manager/utils/logitsThread.h" +#include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/nvtxUtils.h" +#include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/decoderState.h" #include "tensorrt_llm/runtime/decodingInput.h" #include "tensorrt_llm/runtime/decodingOutput.h" +#include "tensorrt_llm/runtime/iBuffer.h" #include "tensorrt_llm/runtime/runtimeKernels.h" #include "tensorrt_llm/runtime/speculativeDecodingMode.h" #include "tensorrt_llm/runtime/utils/mpiUtils.h" @@ -45,6 +48,8 @@ namespace tensorrt_llm::batch_manager using SizeType32 = CreateNewDecoderRequests::SizeType32; using TensorPtr = CreateNewDecoderRequests::TensorPtr; using SharedConstPtr = CreateNewDecoderRequests::SharedConstPtr; +template +using OptionalRef = tensorrt_llm::common::OptionalRef; namespace { @@ -320,149 +325,165 @@ void initializeOutputs(DecodingOutput& dJointOutput, SizeType32 batchSlot, SizeT TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -} // namespace - -void CreateNewDecoderRequests::newRequestSpeculativeDecoding(SizeType32 batchIdx, - runtime::decoder_batch::Request const& request, SamplingConfig const& samplingConfig, - runtime::ModelConfig const& modelConfig, DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, - CudaStream const& runtimeStream, CudaStream const& decoderStream, - SpeculativeDecodingMode const& speculativeDecodingMode, SizeType32 maxDecodingEngineTokens) +void retrieveDraftLogits(TensorPtr& draftLogitsHost, std::shared_ptr const& reqDraftLogits, + ModelConfig const& modelConfig, WorldConfig const& worldConfig, bool speculativeDecodingFastLogits, + bool isLeaderInOrchMode, BufferManager const& bufferManager) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - if (speculativeDecodingMode.predictsDraftTokens()) + if (!speculativeDecodingFastLogits) { - auto const& stream = decoderStream; - BufferManager manager{std::make_shared(stream.get())}; + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); + bufferManager.copy(*reqDraftLogits, *draftLogitsHost); + return; + } - auto& dJointOutput = jointDecodingOutput; + if (isLeaderInOrchMode) + { + // reqDraftLogits contains metadata for fast-logits path; validate size. 
+ auto constexpr fastLogitsInfoSize = sizeof(te::SpeculativeDecodingFastLogitsInfo); + TLLM_CHECK_WITH_INFO(reqDraftLogits->getSizeInBytes() >= fastLogitsInfoSize, + "Draft logits metadata buffer is too small to hold SpeculativeDecodingFastLogitsInfo."); + te::SpeculativeDecodingFastLogitsInfo fastLogitsInfo{}; + std::memcpy(&fastLogitsInfo, reqDraftLogits->data(), fastLogitsInfoSize); + utils::targetModelReceiveLogits(draftLogitsHost, fastLogitsInfo, modelConfig.getLogitsDtype()); - TensorPtr nextDraftTokens - = ITensor::slice(dJointOutput.speculativeDecodingOutputs->nextDraftTokens, batchIdx, 1); - // FIXME: can we skip this? - manager.setZero(*nextDraftTokens); - if (speculativeDecodingMode.variableDraftLength()) + // Broadcast to other ranks if needed + if (worldConfig.isTensorParallel()) { - TensorPtr nextDraftTokensLen - = ITensor::slice(dJointOutput.speculativeDecodingOutputs->nextDraftTokensLen, batchIdx, 1); - manager.setZero(*nextDraftTokensLen); + auto const& commSession = COMM_SESSION; + auto shape = draftLogitsHost->getShape(); + commSession.bcastValue(shape.d[0], 0); + commSession.bcastValue(shape.d[1], 0); + commSession.bcast(draftLogitsHost->data(), draftLogitsHost->getSizeInBytes(), mpi::MpiType::kUINT8, 0); } } + else + { + TLLM_CHECK_WITH_INFO(worldConfig.isTensorParallel(), + "Fast logits path requires tensor-parallel broadcast for non-leader ranks."); - if (speculativeDecodingMode.isDraftTokensExternal()) - { - newRequestDraftTokensExternal(batchIdx, request, samplingConfig, jointDecodingInput, decoderStream); - } - else if (speculativeDecodingMode.isMedusa()) - { - newRequestMedusa(batchIdx, request, jointDecodingInput, decoderStream, maxDecodingEngineTokens); - } - else if (speculativeDecodingMode.isLookaheadDecoding()) - { - newRequestLookahead(batchIdx, request, jointDecodingInput, jointDecodingOutput, runtimeStream); - } - else if (speculativeDecodingMode.isExplicitDraftTokens()) - { - newRequestExplicitDraftTokens(batchIdx, request, jointDecodingOutput, runtimeStream); - } - else if (speculativeDecodingMode.isEagle()) - { - newRequestEagle(batchIdx, request, modelConfig, jointDecodingOutput, runtimeStream); + // Get logits from leader rank + auto const& commSession = COMM_SESSION; + int64_t dims[2]; + commSession.bcastValue(dims[0], 0); + commSession.bcastValue(dims[1], 0); + draftLogitsHost->reshape(ITensor::makeShape({dims[0], dims[1]})); + commSession.bcast(draftLogitsHost->data(), draftLogitsHost->getSizeInBytes(), mpi::MpiType::kUINT8, 0); } + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); -} +}; -void CreateNewDecoderRequests::newRequestDraftTokensExternal(SizeType32 batchIdx, - runtime::decoder_batch::Request const& request, SamplingConfig const& samplingConfig, - DecodingInput& jointDecodingInput, CudaStream const& decoderStream) +//! 
@brief Setups decoder internal tensors for new request in Draft model Sps mode +void newRequestDraftTokensExternal(DecodingInput& jointDecodingInput, SizeType32 batchIdx, LlmRequest const& llmReq, + SizeType32 numDecodingEngineTokens, runtime::ModelConfig const& modelConfig, WorldConfig const& worldConfig, + bool speculativeDecodingFastLogits, bool isLeaderInOrchMode, CudaStream const& decoderStream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - BufferManager manager{std::make_shared(decoderStream.get())}; + BufferManager decoderBufferManager{std::make_shared(decoderStream.get())}; - auto& dJointInput = jointDecodingInput; + TLLM_CHECK(jointDecodingInput.externalDraftTokensInputs); + auto& externalDraftTokensInputs = jointDecodingInput.externalDraftTokensInputs; - auto const numDraftTokens = request.generatedTokensPerEngineStep - 1; + auto const& draftTokens = llmReq.getDraftTokens(); + auto const numDraftTokens = numDecodingEngineTokens - 1; - auto const useDraftLogits = request.draftLogits.has_value(); - if (useDraftLogits) - { - TensorPtr draftLogitsView = ITensor::view(request.draftLogits.value()); - - TensorPtr draftLogitsReqBatchSlice - = ITensor::slice(dJointInput.externalDraftTokensInputs->draftLogits, batchIdx, 1); - draftLogitsReqBatchSlice->squeeze(0); - TensorPtr draftLogitsReqTokensSlice = ITensor::slice(draftLogitsReqBatchSlice, 0, numDraftTokens); - manager.copy(*draftLogitsView, *draftLogitsReqTokensSlice); - } - auto* useDraftLogitsHostPtr = runtime::bufferCast(*dJointInput.externalDraftTokensInputs->useDraftLogitsHost); - useDraftLogitsHostPtr[batchIdx] = useDraftLogits; - auto useDraftLogitsView = ITensor::slice(dJointInput.externalDraftTokensInputs->useDraftLogits, batchIdx, 1); - runtime::kernels::invokeFill(*useDraftLogitsView, useDraftLogits, decoderStream); + auto numDraftTokensHostRange = runtime::BufferRange(*externalDraftTokensInputs->numDraftTokensHost); + numDraftTokensHostRange[batchIdx] = numDraftTokens; + auto numDraftTokensView = ITensor::slice(externalDraftTokensInputs->numDraftTokens, batchIdx, 1); + runtime::kernels::invokeFill(*numDraftTokensView, numDraftTokens, decoderStream); if (numDraftTokens > 0) { - TensorPtr draftTokensReqBatchSlice - = ITensor::slice(dJointInput.externalDraftTokensInputs->draftTokenIds, batchIdx, 1); - draftTokensReqBatchSlice->squeeze(0); - TensorPtr draftTokensReqTokensSlice = ITensor::slice(draftTokensReqBatchSlice, 0, numDraftTokens); - TensorPtr draftTokensView = ITensor::view(request.draftTokens, ITensor::makeShape({numDraftTokens})); - manager.copy(*draftTokensView, *draftTokensReqTokensSlice); + TensorPtr draftTokenIdsHostSlice + = ITensor::slice(externalDraftTokensInputs->draftTokenIdsHost, {batchIdx, 0}, numDraftTokens); + // Copy to pinned host memory (don't care about stream of bufferManager) + decoderBufferManager.copy(draftTokens->data(), *draftTokenIdsHostSlice); + + TensorPtr draftTokenIdsSlice + = ITensor::slice(externalDraftTokensInputs->draftTokenIds, {batchIdx, 0}, numDraftTokens); + decoderBufferManager.copy(*draftTokenIdsHostSlice, *draftTokenIdsSlice); } - auto* numDraftTokensHostPtr - = runtime::bufferCast(*dJointInput.externalDraftTokensInputs->numDraftTokensHost); - numDraftTokensHostPtr[batchIdx] = numDraftTokens; - auto numDraftTokensView = ITensor::slice(dJointInput.externalDraftTokensInputs->numDraftTokens, batchIdx, 1); - runtime::kernels::invokeFill(*numDraftTokensView, numDraftTokens, decoderStream); + auto const& draftLogits = llmReq.getDraftLogits(); + auto const useDraftLogits = 
draftLogits.has_value(); + auto useDraftLogitsHostRange = runtime::BufferRange(*externalDraftTokensInputs->useDraftLogitsHost); + useDraftLogitsHostRange[batchIdx] = useDraftLogits; + auto useDraftLogitsView = ITensor::slice(externalDraftTokensInputs->useDraftLogits, batchIdx, 1); + runtime::kernels::invokeFill(*useDraftLogitsView, useDraftLogits, decoderStream); + + if (useDraftLogits) + { + TensorPtr draftLogitsHostSlice + = ITensor::slice(externalDraftTokensInputs->draftLogitsHost, {batchIdx, 0}, numDraftTokens); + retrieveDraftLogits(draftLogitsHostSlice, draftLogits.value(), modelConfig, worldConfig, + speculativeDecodingFastLogits, isLeaderInOrchMode, decoderBufferManager); + + TensorPtr draftLogitsSlice + = ITensor::slice(externalDraftTokensInputs->draftLogits, {batchIdx, 0}, numDraftTokens); + decoderBufferManager.copy(*draftLogitsHostSlice, *draftLogitsSlice); + } + + auto const& samplingConfig = llmReq.mSamplingConfig; bool const useRandomAcceptanceThreshold = !samplingConfig.draftAcceptanceThreshold.has_value(); float const constantThreshold = useRandomAcceptanceThreshold ? 0 : samplingConfig.draftAcceptanceThreshold.value()[0]; - dJointInput.externalDraftTokensInputs->useRandomAcceptanceThreshold = useRandomAcceptanceThreshold; - dJointInput.externalDraftTokensInputs->constantThreshold = constantThreshold; + externalDraftTokensInputs->useRandomAcceptanceThreshold = useRandomAcceptanceThreshold; + externalDraftTokensInputs->constantThreshold = constantThreshold; TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void CreateNewDecoderRequests::newRequestMedusa(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, - DecodingInput& jointDecodingInput, CudaStream const& decoderStream, SizeType32 maxDecodingEngineTokens) +//! @brief Setups decoder internal tensors for new Medusa request +void newRequestMedusa(DecodingInput& jointDecodingInput, SizeType32 batchIdx, LlmRequest& llmReq, + SizeType32 numDecodingEngineTokens, SizeType32 maxDecodingEngineTokens, MedusaBuffers const& medusaBuffers, + CudaStream const& decoderStream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + llmReq.mSamplingConfig.topKMedusaHeads = {medusaBuffers.mTopKs}; + // FIXME: we must set medusa paths and tree ids not from seq slot, but from llmRequest? + // When multiple microbatches buffers are used, runtime buffers can not be addressed with seqSlot. 
+ auto medusaPaths = ITensor::slice(medusaBuffers.medusaPathsDevice, 0, 1); + auto medusaTreeIds = ITensor::slice(medusaBuffers.medusaTreeIdsDevice, 0, 1); + BufferManager manager{std::make_shared(decoderStream.get())}; - auto& dJointInput = jointDecodingInput; + auto& medusaInputs = jointDecodingInput.medusaInputs; TensorPtr curTokensPerStepSlice - = ITensor::slice(constPointerCast(dJointInput.medusaInputs->medusaCurTokensPerStep), batchIdx, 1); + = ITensor::slice(constPointerCast(medusaInputs->medusaCurTokensPerStep), batchIdx, 1); // Context phase Medusa processes 1 token only, new value from targetTokensPerStep will be filled at the end // of first decoder runtime::kernels::invokeFill(*curTokensPerStepSlice, 1, decoderStream); TensorPtr targetTokensPerStepSlice - = ITensor::slice(constPointerCast(dJointInput.medusaInputs->medusaTargetTokensPerStep), batchIdx, 1); - auto const generatedTokensPerEngineStep = request.generatedTokensPerEngineStep; - TLLM_CHECK_WITH_INFO(generatedTokensPerEngineStep <= maxDecodingEngineTokens, - "Tokens per step for (%d) is larger than maximum tokens per step (%d)", generatedTokensPerEngineStep, + = ITensor::slice(constPointerCast(medusaInputs->medusaTargetTokensPerStep), batchIdx, 1); + TLLM_CHECK_WITH_INFO(numDecodingEngineTokens <= maxDecodingEngineTokens, + "Tokens per step for (%d) is larger than maximum tokens per step (%d)", numDecodingEngineTokens, maxDecodingEngineTokens); - runtime::kernels::invokeFill(*targetTokensPerStepSlice, generatedTokensPerEngineStep, decoderStream); + runtime::kernels::invokeFill(*targetTokensPerStepSlice, numDecodingEngineTokens, decoderStream); - TensorPtr pathsSlice = ITensor::slice(constPointerCast(dJointInput.medusaInputs->medusaPaths), batchIdx, 1); - manager.copy(*request.medusaPaths, *pathsSlice); + TensorPtr pathsSlice = ITensor::slice(constPointerCast(medusaInputs->medusaPaths), batchIdx, 1); + manager.copy(*medusaPaths, *pathsSlice); - TensorPtr treeIdsSlice = ITensor::slice(constPointerCast(dJointInput.medusaInputs->medusaTreeIds), batchIdx, 1); - manager.copy(*request.medusaTreeIds, *treeIdsSlice); + TensorPtr treeIdsSlice = ITensor::slice(constPointerCast(medusaInputs->medusaTreeIds), batchIdx, 1); + manager.copy(*medusaTreeIds, *treeIdsSlice); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void CreateNewDecoderRequests::newRequestLookahead(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, - DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream) +//! @brief Setups decoder internal tensors for new Lookahead request +void newRequestLookahead(DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, SizeType32 batchIdx, + CudaStream const& runtimeStream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(jointDecodingOutput.lookaheadOutputs); + TLLM_CHECK(jointDecodingInput.lookaheadInputs); // The first generation step only generate 1 token. TensorPtr curTokensPerStepSlice @@ -472,65 +493,72 @@ void CreateNewDecoderRequests::newRequestLookahead(SizeType32 batchIdx, runtime: TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void CreateNewDecoderRequests::newRequestExplicitDraftTokens(SizeType32 batchIdx, - runtime::decoder_batch::Request const& request, DecodingOutput& jointDecodingOutput, - CudaStream const& runtimeStream) +//! 
@brief Setups decoder internal tensors for new Explicit draft tokens request +void newRequestExplicitDraftTokens( + DecodingOutput& jointDecodingOutput, SizeType32 batchIdx, LlmRequest const& llmReq, CudaStream const& runtimeStream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(jointDecodingOutput.explicitDraftTokensBuffers); + auto const inputLen = llmReq.getPromptLen(); + TensorPtr positionIdsBaseSlice = ITensor::slice(jointDecodingOutput.explicitDraftTokensBuffers->positionIdsBase, batchIdx, 1); - runtime::kernels::invokeFill(*positionIdsBaseSlice, request.inputLen, runtimeStream); + runtime::kernels::invokeFill(*positionIdsBaseSlice, inputLen, runtimeStream); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -void CreateNewDecoderRequests::newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request, - runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream) +//! @brief Setups decoder internal tensors for new Eagle request +void newRequestEagle(DecodingOutput& jointDecodingOutput, SizeType32 batchIdx, LlmRequest const& llmReq, + runtime::ModelConfig const& modelConfig, executor::DecodingConfig const& decodingConfig, + CudaStream const& runtimeStream) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); TLLM_CHECK(jointDecodingOutput.eagleBuffers); + auto& eagleBuffers = *jointDecodingOutput.eagleBuffers; + + auto const inputLen = llmReq.getPromptLen(); BufferManager manager{std::make_shared(runtimeStream.get())}; - TensorPtr eagleNetCtxRequestTypesHostSlice - = ITensor::slice(jointDecodingOutput.eagleBuffers->eagleNetCtxRequestTypesHost, batchIdx, 1); + TensorPtr eagleNetCtxRequestTypesHostSlice = ITensor::slice(eagleBuffers.eagleNetCtxRequestTypesHost, batchIdx, 1); TensorPtr eagleNetCtxContextLengthsHostSlice - = ITensor::slice(jointDecodingOutput.eagleBuffers->eagleNetCtxContextLengthsHost, batchIdx, 1); + = ITensor::slice(eagleBuffers.eagleNetCtxContextLengthsHost, batchIdx, 1); TensorPtr eagleNetCtxPastKeyValueLengthsHostSlice - = ITensor::slice(jointDecodingOutput.eagleBuffers->eagleNetCtxPastKeyValueLengthsHost, batchIdx, 1); + = ITensor::slice(eagleBuffers.eagleNetCtxPastKeyValueLengthsHost, batchIdx, 1); runtime::bufferCast(*eagleNetCtxRequestTypesHostSlice)[0] = 0; - runtime::bufferCast(*eagleNetCtxContextLengthsHostSlice)[0] = request.inputLen; - runtime::bufferCast(*eagleNetCtxPastKeyValueLengthsHostSlice)[0] = request.inputLen; + runtime::bufferCast(*eagleNetCtxContextLengthsHostSlice)[0] = inputLen; + runtime::bufferCast(*eagleNetCtxPastKeyValueLengthsHostSlice)[0] = inputLen; - TensorPtr eagleNetGenRequestTypesHostSlice - = ITensor::slice(jointDecodingOutput.eagleBuffers->eagleNetGenRequestTypesHost, batchIdx, 1); + TensorPtr eagleNetGenRequestTypesHostSlice = ITensor::slice(eagleBuffers.eagleNetGenRequestTypesHost, batchIdx, 1); TensorPtr eagleNetGenContextLengthsHostSlice - = ITensor::slice(jointDecodingOutput.eagleBuffers->eagleNetGenContextLengthsHost, batchIdx, 1); + = ITensor::slice(eagleBuffers.eagleNetGenContextLengthsHost, batchIdx, 1); TensorPtr eagleNetGenPastKeyValueLengthsHostSlice - = ITensor::slice(jointDecodingOutput.eagleBuffers->eagleNetGenPastKeyValueLengthsHost, batchIdx, 1); + = ITensor::slice(eagleBuffers.eagleNetGenPastKeyValueLengthsHost, batchIdx, 1); runtime::bufferCast(*eagleNetGenRequestTypesHostSlice)[0] = 1; - runtime::bufferCast(*eagleNetGenContextLengthsHostSlice)[0] = request.inputLen; - 
runtime::bufferCast(*eagleNetGenPastKeyValueLengthsHostSlice)[0] = request.inputLen; + runtime::bufferCast(*eagleNetGenContextLengthsHostSlice)[0] = inputLen; + runtime::bufferCast(*eagleNetGenPastKeyValueLengthsHostSlice)[0] = inputLen; auto const eagleModule = std::dynamic_pointer_cast( modelConfig.getSpeculativeDecodingModulePtr()); std::optional eagleChoicesOpt; - if (request.eagleConfig) + auto const& eagleConfig = llmReq.getEagleConfig() ? llmReq.getEagleConfig() : decodingConfig.getEagleConfig(); + + if (eagleConfig) { - eagleChoicesOpt = request.eagleConfig->getEagleChoices(); + eagleChoicesOpt = eagleConfig->getEagleChoices(); } - if (!request.eagleConfig || !request.eagleConfig->useDynamicTree()) + if (!eagleConfig || !eagleConfig->useDynamicTree()) { - TensorPtr draftPathsHostSlice = ITensor::slice(jointDecodingOutput.eagleBuffers->draftPathsHost, batchIdx, 1); - TensorPtr draftPathsSlice = ITensor::slice(jointDecodingOutput.eagleBuffers->draftPaths, batchIdx, 1); + TensorPtr draftPathsHostSlice = ITensor::slice(eagleBuffers.draftPathsHost, batchIdx, 1); + TensorPtr draftPathsSlice = ITensor::slice(eagleBuffers.draftPaths, batchIdx, 1); // eagleConfig is nullptr or Eagle-1 std::vector topKs; @@ -546,6 +574,61 @@ void CreateNewDecoderRequests::newRequestEagle(SizeType32 batchIdx, runtime::dec TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } +//! @brief Setups decoder internal tensors for new speculative decoding request +void newRequestSpeculativeDecoding(DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, + SizeType32 batchIdx, LlmRequest& llmReq, SpeculativeDecodingMode const& speculativeDecodingMode, + SizeType32 numDecodingEngineTokens, SizeType32 maxDecodingEngineTokens, + OptionalRef medusaBuffers, runtime::ModelConfig const& modelConfig, + WorldConfig const& worldConfig, executor::DecodingConfig const& decodingConfig, bool speculativeDecodingFastLogits, + bool isLeaderInOrchMode, CudaStream const& runtimeStream, CudaStream const& decoderStream) +{ + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + + if (speculativeDecodingMode.predictsDraftTokens()) + { + BufferManager manager{std::make_shared(decoderStream.get())}; + + TLLM_CHECK(jointDecodingOutput.speculativeDecodingOutputs); + auto& speculativeDecodingOutputs = *jointDecodingOutput.speculativeDecodingOutputs; + + TensorPtr nextDraftTokens = ITensor::slice(speculativeDecodingOutputs.nextDraftTokens, batchIdx, 1); + // FIXME: can we skip this? 
+ manager.setZero(*nextDraftTokens); + if (speculativeDecodingMode.variableDraftLength()) + { + TensorPtr nextDraftTokensLen = ITensor::slice(speculativeDecodingOutputs.nextDraftTokensLen, batchIdx, 1); + manager.setZero(*nextDraftTokensLen); + } + } + + if (speculativeDecodingMode.isDraftTokensExternal()) + { + newRequestDraftTokensExternal(jointDecodingInput, batchIdx, llmReq, numDecodingEngineTokens, modelConfig, + worldConfig, speculativeDecodingFastLogits, isLeaderInOrchMode, decoderStream); + } + else if (speculativeDecodingMode.isMedusa()) + { + TLLM_CHECK(medusaBuffers); + newRequestMedusa(jointDecodingInput, batchIdx, llmReq, numDecodingEngineTokens, maxDecodingEngineTokens, + medusaBuffers.value(), decoderStream); + } + else if (speculativeDecodingMode.isLookaheadDecoding()) + { + newRequestLookahead(jointDecodingInput, jointDecodingOutput, batchIdx, runtimeStream); + } + else if (speculativeDecodingMode.isExplicitDraftTokens()) + { + newRequestExplicitDraftTokens(jointDecodingOutput, batchIdx, llmReq, runtimeStream); + } + else if (speculativeDecodingMode.isEagle()) + { + newRequestEagle(jointDecodingOutput, batchIdx, llmReq, modelConfig, decodingConfig, runtimeStream); + } + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +} // namespace + std::tuple, std::vector> CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds, executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState, @@ -563,9 +646,6 @@ CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedCon } inputIds->resize(decoderInputSize); - std::vector decoderRequests; - decoderRequests.reserve(finishedContextRequests.size()); - std::vector lookaheadPrompt; std::vector lookaheadAlgoConfigs; if (modelConfig.getSpeculativeDecodingMode().isLookaheadDecoding()) @@ -597,36 +677,18 @@ CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedCon auto const promptLen = llmReq->getPromptLen(); - auto decoderRequest = decoder_batch::Request{promptLen}; - + SizeType32 numDecodingEngineTokens{1}; if (modelConfig.getSpeculativeDecodingMode().isDraftTokensExternal()) { - if (llmReq->hasDraftTokens()) - { - auto const& draftTokens = llmReq->getDraftTokens(); - // Copy to pinned host memory (don't care about stream of bufferManager) - decoderRequest.draftTokens = decoderBufferManager.copyFrom(*draftTokens, MemoryType::kPINNEDPOOL); - auto const& draftLogits = llmReq->getDraftLogits(); - if (draftLogits.has_value()) - { - decoderRequest.draftLogits - = retrieveDraftLogits(modelConfig, worldConfig, draftLogits.value(), decoderBufferManager); - } - decoderRequest.generatedTokensPerEngineStep = draftTokens->size() + 1; - } - else - { - decoderRequest.generatedTokensPerEngineStep = 1; - } + numDecodingEngineTokens = llmReq->getNumDraftTokens() + 1; } else if (!modelConfig.getSpeculativeDecodingMode().isNone()) { - decoderRequest.generatedTokensPerEngineStep = modelConfig.getMaxDecodingTokens(); + numDecodingEngineTokens = modelConfig.getMaxDecodingTokens(); } auto& dJointInput = decoderState.getJointDecodingInput(); - auto const numDecodingEngineTokens = decoderRequest.generatedTokensPerEngineStep; initializeInputLengths(dJointInput, batchSlot, promptLen, llmReq->mMaxNewTokens, numDecodingEngineTokens, maxSequenceLength, decoderBufferManager); decoderState.setNumDecodingEngineTokens(batchSlot, numDecodingEngineTokens); @@ -667,16 +729,7 @@ CreateNewDecoderRequests::createDecoderRequests(RequestVector 
const& finishedCon { TLLM_CHECK(beamWidth == 1); - if (modelConfig.getSpeculativeDecodingMode().isMedusa()) - { - TLLM_CHECK(medusaBuffers); - llmReq->mSamplingConfig.topKMedusaHeads = {medusaBuffers->mTopKs}; - // FIXME: we must set medusa paths and tree ids not from seq slot, but from llmRequest? - // When multiple microbatches buffers are used, runtime buffers can not be addressed with seqSlot. - decoderRequest.medusaPaths = ITensor::slice(medusaBuffers->medusaPathsDevice, 0, 1); - decoderRequest.medusaTreeIds = ITensor::slice(medusaBuffers->medusaTreeIdsDevice, 0, 1); - } - else if (modelConfig.getSpeculativeDecodingMode().isLookaheadDecoding()) + if (modelConfig.getSpeculativeDecodingMode().isLookaheadDecoding()) { lookaheadPrompt.emplace_back(requestIds); @@ -684,67 +737,17 @@ CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedCon = llmReq->getLookaheadConfig().value_or(decodingConfig.getLookaheadDecodingConfig().value()); lookaheadAlgoConfigs.emplace_back(lookaheadRuntimeConfig); } - else if (modelConfig.getSpeculativeDecodingMode().isEagle()) - { - decoderRequest.eagleConfig - = llmReq->getEagleConfig() ? llmReq->getEagleConfig() : decodingConfig.getEagleConfig(); - } - newRequestSpeculativeDecoding(batchSlot, decoderRequest, samplingConfig, modelConfig, - decoderState.getJointDecodingInput(), decoderState.getJointDecodingOutput(), runtimeStream, - decoderStream, decoderState.getSpeculativeDecodingMode(), decoderState.getMaxDecodingEngineTokens()); + newRequestSpeculativeDecoding(decoderState.getJointDecodingInput(), decoderState.getJointDecodingOutput(), + batchSlot, *llmReq, decoderState.getSpeculativeDecodingMode(), numDecodingEngineTokens, + decoderState.getMaxDecodingEngineTokens(), medusaBuffers, modelConfig, worldConfig, decodingConfig, + mSpeculativeDecodingFastLogits, mIsLeaderInOrchMode, runtimeStream, decoderStream); } - decoderRequests.push_back(decoderRequest); - inputOffset += promptLen; } return {std::move(lookaheadPrompt), std::move(lookaheadAlgoConfigs)}; } -std::shared_ptr CreateNewDecoderRequests::retrieveDraftLogits(ModelConfig const& modelConfig, - WorldConfig const& worldConfig, std::shared_ptr const& tensor, - BufferManager const& bufferManager) const -{ - TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); - - if (!mSpeculativeDecodingFastLogits) - { - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return bufferManager.copyFrom(*tensor, MemoryType::kPINNEDPOOL); - } - - if (mIsLeaderInOrchMode) - { - te::SpeculativeDecodingFastLogitsInfo fastLogitsInfo; - std::memcpy(&fastLogitsInfo, tensor->data(), sizeof(fastLogitsInfo)); - auto logits = utils::targetModelReceiveLogits(fastLogitsInfo, modelConfig).value(); - - // Broadcast to other ranks if needed - if (worldConfig.isTensorParallel()) - { - auto const& commSession = COMM_SESSION; - auto shape = logits->getShape(); - commSession.bcastValue(shape.d[0], 0); - commSession.bcastValue(shape.d[1], 0); - commSession.bcast(logits->data(), logits->getSizeInBytes(), mpi::MpiType::kUINT8, 0); - } - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return logits; - } - - // Get logits from leader rank - auto const& commSession = COMM_SESSION; - int64_t dims[2]; - commSession.bcastValue(dims[0], 0); - commSession.bcastValue(dims[1], 0); - auto const logitsDtype = modelConfig.getLogitsDtype(); - auto logits = tensorrt_llm::runtime::BufferManager::pinnedPool(ITensor::makeShape({dims[0], dims[1]}), logitsDtype); - commSession.bcast(logits->data(), logits->getSizeInBytes(), mpi::MpiType::kUINT8, 
0); - - TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); - return logits; -}; - } // namespace tensorrt_llm::batch_manager diff --git a/cpp/tensorrt_llm/batch_manager/utils/logitsThread.cpp b/cpp/tensorrt_llm/batch_manager/utils/logitsThread.cpp index 484cd7c3c7..7234ca9ba5 100644 --- a/cpp/tensorrt_llm/batch_manager/utils/logitsThread.cpp +++ b/cpp/tensorrt_llm/batch_manager/utils/logitsThread.cpp @@ -121,8 +121,8 @@ void draftModelSendLogitsThread(int device, std::atomic* draftModelThreadS #endif // ENABLE_MULTI_DEVICE } -std::optional targetModelReceiveLogits( - executor::SpeculativeDecodingFastLogitsInfo const& fastLogitsInfo, runtime::ModelConfig const& modelConfig) +void targetModelReceiveLogits(runtime::ITensor::SharedPtr& draftLogitsHost, + executor::SpeculativeDecodingFastLogitsInfo const& fastLogitsInfo, nvinfer1::DataType logitsDtype) { #if ENABLE_MULTI_DEVICE auto const& worldComm = tensorrt_llm::mpi::MpiComm::world(); @@ -151,10 +151,7 @@ std::optional targetModelReceiveLogits( int64_t dims[2]; MPICHECK(MPI_Mrecv(&dims, count, MPI_INT64_T, &msg, &status)); - auto const logitsDtype = modelConfig.getLogitsDtype(); - - auto tensor = tensorrt_llm::runtime::BufferManager::pinnedPool( - runtime::ITensor::makeShape({dims[0], dims[1]}), logitsDtype); + draftLogitsHost->reshape(runtime::ITensor::makeShape({dims[0], dims[1]})); worldComm.mprobe(fastLogitsInfo.draftParticipantId, mpi::MpiTag::kSpecDecLogitsData, &msg, &status); @@ -163,11 +160,7 @@ std::optional targetModelReceiveLogits( uint64_t const expectedSize = static_cast(dims[0]) * dims[1] * tc::getDTypeSize(logitsDtype); TLLM_CHECK((uint64_t) count == expectedSize); - MPICHECK(MPI_Mrecv(tensor->data(), count, MPI_UINT8_T, &msg, &status)); - - return tensor; -#else - return std::nullopt; + MPICHECK(MPI_Mrecv(draftLogitsHost->data(), count, MPI_UINT8_T, &msg, &status)); #endif // ENABLE_MULTI_DEVICE } diff --git a/cpp/tensorrt_llm/batch_manager/utils/logitsThread.h b/cpp/tensorrt_llm/batch_manager/utils/logitsThread.h index 6d87ebee16..f19d5f5ef3 100644 --- a/cpp/tensorrt_llm/batch_manager/utils/logitsThread.h +++ b/cpp/tensorrt_llm/batch_manager/utils/logitsThread.h @@ -21,10 +21,8 @@ #include "tensorrt_llm/executor/executor.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iTensor.h" -#include "tensorrt_llm/runtime/modelConfig.h" #include -#include namespace tensorrt_llm::batch_manager { @@ -52,7 +50,7 @@ void draftModelSendLogitsThread(int device, std::atomic* draftModelThreadS std::shared_ptr const& crossKvCacheManager, std::shared_ptr const& peftCacheManager); -std::optional targetModelReceiveLogits( - executor::SpeculativeDecodingFastLogitsInfo const& fastLogitsInfo, runtime::ModelConfig const& modelConfig); +void targetModelReceiveLogits(runtime::ITensor::SharedPtr& draftLogitsHost, + executor::SpeculativeDecodingFastLogitsInfo const& fastLogitsInfo, nvinfer1::DataType logitsDtype); } // namespace tensorrt_llm::batch_manager::utils diff --git a/cpp/tensorrt_llm/runtime/decoderState.cpp b/cpp/tensorrt_llm/runtime/decoderState.cpp index abccbe60a1..b5851dc1c2 100644 --- a/cpp/tensorrt_llm/runtime/decoderState.cpp +++ b/cpp/tensorrt_llm/runtime/decoderState.cpp @@ -131,6 +131,7 @@ void DecoderState::setupSpeculativeDecodingBuffers( mSpeculativeDecodingMode = speculativeDecodingMode; + auto constexpr nvTokenIdType = TRTDataType::value; auto constexpr nvSizeType = TRTDataType::value; auto& dInput = mJointDecodingInput; @@ -179,6 +180,7 @@ void DecoderState::setupSpeculativeDecodingBuffers( 
DecodingInput::ExternalDraftTokensInputs externalDraftTokensInputs; externalDraftTokensInputs.draftLogits = bufferManager.emptyTensor(MemoryType::kGPU, dtype); + externalDraftTokensInputs.draftLogitsHost = bufferManager.emptyTensor(MemoryType::kPINNEDPOOL, dtype); externalDraftTokensInputs.draftProbs = bufferManager.emptyTensor(MemoryType::kGPU, dtype); externalDraftTokensInputs.targetProbs = bufferManager.emptyTensor(MemoryType::kGPU, dtype); externalDraftTokensInputs.numDraftTokens = bufferManager.emptyTensor(MemoryType::kGPU, nvSizeType); @@ -187,8 +189,8 @@ void DecoderState::setupSpeculativeDecodingBuffers( = bufferManager.emptyTensor(MemoryType::kGPU, TRTDataType::value); externalDraftTokensInputs.useDraftLogitsHost = bufferManager.emptyTensor(MemoryType::kPINNEDPOOL, TRTDataType::value); - externalDraftTokensInputs.draftTokenIds - = bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); + externalDraftTokensInputs.draftTokenIds = bufferManager.emptyTensor(MemoryType::kGPU, nvTokenIdType); + externalDraftTokensInputs.draftTokenIdsHost = bufferManager.emptyTensor(MemoryType::kPINNEDPOOL, nvTokenIdType); dInput->externalDraftTokensInputs = externalDraftTokensInputs; } @@ -366,10 +368,16 @@ void DecoderState::reshapeSpeculativeDecodingBuffers(SpeculativeDecodingMode con {mMaxNumSequences, mMaxDecodingEngineTokens, mMaxBeamWidth, static_cast(vocabSizePadded)}); dInput.externalDraftTokensInputs->draftProbs->reshape(probsShape); dInput.externalDraftTokensInputs->targetProbs->reshape(probsShape); - dInput.externalDraftTokensInputs->draftLogits->reshape( - ITensor::makeShape({mMaxNumSequences, mMaxDecodingEngineTokens, static_cast(vocabSizePadded)})); - dInput.externalDraftTokensInputs->draftTokenIds->reshape( - ITensor::makeShape({mMaxNumSequences, mMaxDecodingEngineTokens})); + + auto const logitsShape = ITensor::makeShape( + {mMaxNumSequences, mMaxDecodingEngineTokens, static_cast(vocabSizePadded)}); + dInput.externalDraftTokensInputs->draftLogits->reshape(logitsShape); + dInput.externalDraftTokensInputs->draftLogitsHost->reshape(logitsShape); + + auto const tokenIdsShape = ITensor::makeShape({mMaxNumSequences, mMaxDecodingEngineTokens}); + dInput.externalDraftTokensInputs->draftTokenIds->reshape(tokenIdsShape); + dInput.externalDraftTokensInputs->draftTokenIdsHost->reshape(tokenIdsShape); + dInput.externalDraftTokensInputs->numDraftTokens->reshape(maxNumSequencesShape); dInput.externalDraftTokensInputs->numDraftTokensHost->reshape(maxNumSequencesShape); dInput.externalDraftTokensInputs->useDraftLogits->reshape(maxNumSequencesShape); From b36460d7b5697c9f01628e8573fb50de15be49f5 Mon Sep 17 00:00:00 2001 From: Izzy Putterman Date: Fri, 22 Aug 2025 09:57:17 -0700 Subject: [PATCH 31/33] [None][feat] Deepseek: Start Eagle work (#6210) Signed-off-by: Izzy Putterman Co-authored-by: Mike Iovine --- .../_torch/models/modeling_deepseekv3.py | 34 +++++- .../_torch/models/modeling_speculative.py | 10 +- .../_torch/pyexecutor/py_executor_creator.py | 7 +- tensorrt_llm/_torch/speculative/eagle3.py | 52 +++++---- tensorrt_llm/_torch/speculative/interface.py | 7 ++ tensorrt_llm/_torch/speculative/utils.py | 2 + tensorrt_llm/llmapi/llm_args.py | 19 +++- .../_torch/speculative/test_eagle3.py | 105 ++++++++++++++++++ 8 files changed, 201 insertions(+), 35 deletions(-) diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py index 8eb9acfada..c9b9fa979f 100644 --- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py +++ 
b/tensorrt_llm/_torch/models/modeling_deepseekv3.py @@ -65,7 +65,7 @@ from ..modules.linear import Linear, TensorParallelMode, WeightsLoadingConfig from ..modules.multi_stream_utils import maybe_execute_in_parallel from ..modules.rms_norm import RMSNorm from ..peft.lora.layer import LoraLayer -from ..speculative import MTPSpecMetadata, SpecMetadata +from ..speculative import SpecMetadata from ..utils import AuxStreamType, EventType, Fp4QuantizedTensor from .modeling_speculative import SpecDecOneEngineForCausalLM from .modeling_utils import (DecoderModel, EagerFusionConfig, filter_weights, @@ -230,7 +230,7 @@ class DeepseekV3Attention(MLA): aux_stream: Optional[torch.cuda.Stream] = None, ): config = model_config.pretrained_config - predicted_tokens_per_seq = model_config.spec_config.num_nextn_predict_layers + 1 if model_config.spec_config is not None else 1 + predicted_tokens_per_seq = model_config.spec_config.max_draft_len + 1 if model_config.spec_config is not None else 1 super().__init__(hidden_size=config.hidden_size, num_attention_heads=config.num_attention_heads, num_key_value_heads=config.num_key_value_heads, @@ -750,6 +750,7 @@ class DeepseekV3DecoderLayer(DecoderLayer): hidden_states: torch.Tensor, attn_metadata: AttentionMetadata, residual: torch.Tensor, + spec_metadata: Optional[SpecMetadata] = None, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: if residual is None: @@ -765,16 +766,24 @@ class DeepseekV3DecoderLayer(DecoderLayer): **kwargs, ) if isinstance(self.mlp, Deepseekv3MoE): + if spec_metadata is not None and spec_metadata.is_layer_capture( + self.layer_idx): + self.fusion_config.POST_MOE_FUSION = False return self.forward_MoE( hidden_states=hidden_states, attn_metadata=attn_metadata, residual=residual, + spec_metadata=spec_metadata, ) else: + if spec_metadata is not None and spec_metadata.is_layer_capture( + self.layer_idx): + self.fusion_config.POST_MLP_FUSION = False assert isinstance(self.mlp, GatedMLP) return self.forward_mlp( hidden_states=hidden_states, residual=residual, + spec_metadata=spec_metadata, ) def forward_MoE( @@ -782,6 +791,7 @@ class DeepseekV3DecoderLayer(DecoderLayer): hidden_states: torch.Tensor, attn_metadata: AttentionMetadata, residual: torch.Tensor, + spec_metadata: Optional[SpecMetadata] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: def _run_MoE(hidden_states, hidden_states_fp4, do_finalize): @@ -856,6 +866,10 @@ class DeepseekV3DecoderLayer(DecoderLayer): hidden_states, residual = self.moe_allreduce( fc2_output, all_reduce_params=moe_all_reduce_params) else: + if spec_metadata is not None and spec_metadata.is_layer_capture( + self.layer_idx): + spec_metadata.maybe_capture_hidden_states( + self.layer_idx, hidden_states, residual) if self.next_layer_layernorm is not None: hidden_states, residual = self.next_layer_layernorm( hidden_states, residual) @@ -866,6 +880,7 @@ class DeepseekV3DecoderLayer(DecoderLayer): self, hidden_states: torch.Tensor, residual: torch.Tensor, + spec_metadata: Optional[SpecMetadata] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: if self.fusion_config.PRE_MLP_FUSION: @@ -903,6 +918,10 @@ class DeepseekV3DecoderLayer(DecoderLayer): ), ) else: + if spec_metadata is not None and spec_metadata.is_layer_capture( + self.layer_idx): + spec_metadata.maybe_capture_hidden_states( + self.layer_idx, hidden_states, residual) if self.next_layer_layernorm is not None: hidden_states, residual = self.next_layer_layernorm( hidden_states, residual) @@ -1105,6 +1124,7 @@ class DeepseekV3Model(DecoderModel): 
hidden_states=hidden_states, attn_metadata=attn_metadata, residual=residual, + spec_metadata=spec_metadata, ) return hidden_states @@ -1132,7 +1152,8 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model, model_config=model_config) self.model_nextn = 0 - if model_config.spec_config is not None: + if model_config.spec_config is not None and model_config.spec_config.spec_dec_mode.is_mtp( + ): model_nextn = model_config.spec_config.num_nextn_predict_layers ckpt_nextn = self.config.num_nextn_predict_layers self.num_hidden_layers = self.config.num_hidden_layers @@ -1167,11 +1188,10 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model, input_ids: torch.IntTensor = None, position_ids: Optional[torch.IntTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, - spec_metadata: Optional[MTPSpecMetadata] = None, + spec_metadata: Optional[SpecMetadata] = None, return_context_logits: bool = False, **kwargs, ) -> torch.Tensor: - attn_metadata.num_generations_per_batch = self.model_nextn + 1 return super().forward(attn_metadata=attn_metadata, input_ids=input_ids, position_ids=position_ids, @@ -1313,7 +1333,9 @@ class DeepseekV3ForCausalLM(SpecDecOneEngineForCausalLM[DeepseekV3Model, for name, module in tqdm(all_named_modules.items(), desc="Loading weights"): - if len(module._parameters) > 0: + if len(module._parameters) <= 0 or name.startswith("draft_model"): + continue + else: names = name.split('.') parent_module_name = '.'.join(names[:-1]) if "model.layers" in name and int( diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py index f82c3b4de0..56a489c963 100644 --- a/tensorrt_llm/_torch/models/modeling_speculative.py +++ b/tensorrt_llm/_torch/models/modeling_speculative.py @@ -155,10 +155,12 @@ class Eagle3DraftModel(DecoderModel): else: self.hidden_size_in = config.hidden_size - self.fc = Linear(self.hidden_size_in * 3, - config.hidden_size, - bias=getattr(config, "bias", False), - dtype=config.torch_dtype) + if self.spec_config.num_capture_layers > 1: + self.fc = Linear(self.hidden_size_in * + self.spec_config.num_capture_layers, + config.hidden_size, + bias=getattr(config, "bias", False), + dtype=config.torch_dtype) self.midlayer = Eagle3DecoderLayer(model_config, start_layer_idx) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index 12686728cd..ac3bb7a9f5 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -23,7 +23,7 @@ from ..speculative import (get_num_extra_kv_tokens, get_spec_drafter, get_spec_resource_manager) from ._util import (KvCacheCreator, _adjust_torch_mem_fraction, create_py_executor_instance, instantiate_sampler, is_mla) -from .config import PyTorchConfig +from .config import LoadFormat, PyTorchConfig from .config_utils import is_mla from .guided_decoder import GuidedDecoder from .model_engine import PyTorchModelEngine @@ -252,13 +252,16 @@ def create_py_executor( with mem_monitor.observe_creation_stage( _ExecutorCreationStage.MODEL_ENGINE_DRAFT): draft_spec_config = copy.copy(spec_config) + draft_pytorch_backend_config = copy.copy(pytorch_backend_config) + if spec_config.load_format == "dummy": + draft_pytorch_backend_config.load_format = LoadFormat.DUMMY # The draft model won't have any draft tokens attached to # generation requests when we invoke it autoregressively draft_spec_config.max_draft_len = 0 
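        # Note: the copies above leave the target engine's pytorch_backend_config
        # untouched; when spec_config.load_format == "dummy", the draft engine is
        # constructed without loading real checkpoint weights.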
draft_model_engine = PyTorchModelEngine( model_path=spec_config.speculative_model_dir, - pytorch_backend_config=pytorch_backend_config, + pytorch_backend_config=draft_pytorch_backend_config, batch_size=executor_config.max_batch_size, max_beam_width=executor_config.max_beam_width, max_num_tokens=executor_config.max_num_tokens, diff --git a/tensorrt_llm/_torch/speculative/eagle3.py b/tensorrt_llm/_torch/speculative/eagle3.py index 417becf12f..2d4225641b 100644 --- a/tensorrt_llm/_torch/speculative/eagle3.py +++ b/tensorrt_llm/_torch/speculative/eagle3.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import List, Optional, Tuple +from typing import List, Optional, Set import torch from torch import nn @@ -35,9 +35,10 @@ class Eagle3ResourceManager(BaseResourceManager): # empty hidden states tensor max_num_tokens = min(max_num_tokens, max_num_requests * self.max_seq_len) - self.hidden_states = torch.empty((max_num_tokens, self.hidden_size * 3), - dtype=self.dtype, - device='cuda') + self.hidden_states = torch.empty( + (max_num_tokens, self.hidden_size * config.num_capture_layers), + dtype=self.dtype, + device='cuda') # sequence length, only used for metadata preparation self.seq_lens = {i: 0 for i in range(max_num_requests)} # start indices of each slot @@ -79,8 +80,7 @@ class Eagle3ResourceManager(BaseResourceManager): @dataclass class Eagle3SpecMetadata(SpecMetadata): hidden_states: List[torch.Tensor] = field(default_factory=list) - num_capture_layers: int = 3 - layers_to_capture: Tuple[int, ...] = field(init=False) + layers_to_capture: Optional[Set[int]] = None target_model_embed_tokens: Optional[torch.nn.Module] = None hidden_size: int = 0 max_num_tokens: int = 0 @@ -90,14 +90,19 @@ class Eagle3SpecMetadata(SpecMetadata): eagle3_resource_manager: Optional[Eagle3ResourceManager] = None def __post_init__(self): - if self.num_layers == 1: - self.layers_to_capture = (0, ) - else: - if self.num_layers <= 5: - raise ValueError("Not enough hidden layers for EAGLE") + if self.layers_to_capture is None: + if self.num_layers == 1: + self.layers_to_capture = (self.num_layers - 1, ) + else: + if self.num_layers <= 5: + raise ValueError( + "Not enough hidden layers for default EAGLE3 capture") - self.layers_to_capture = (1, self.num_layers // 2 - 1, - self.num_layers - 4) + self.layers_to_capture = (1, self.num_layers // 2 - 1, + self.num_layers - 4) + else: + self.layers_to_capture = sorted(list(self.layers_to_capture)) + self.num_capture_layers = len(self.layers_to_capture) # Initialize to 0 to avoid reading uninitialized memory during warmup self.hidden_states_read_indices = torch.zeros([self.max_num_tokens], @@ -186,7 +191,7 @@ class Eagle3OneModelSpecMetadata(SpecMetadata): # The hidden states hidden_states: Optional[torch.Tensor] = None # The layers to be captured - layers_to_capture: Tuple[int, ...] 
= field(init=False) + layers_to_capture: Optional[Set[int]] = None # The hidden size of the hidden states hidden_size: int = 0 # The max number of tokens @@ -197,14 +202,19 @@ class Eagle3OneModelSpecMetadata(SpecMetadata): batch_indices_cuda: Optional[torch.Tensor] = None def __post_init__(self): - if self.num_layers == 1: - self.layers_to_capture = (1, ) - else: - if self.num_layers <= 5: - raise ValueError("Not enough hidden layers for EAGLE") + if self.layers_to_capture is None: + if self.num_layers == 1: + self.layers_to_capture = (self.num_layers - 1, ) + else: + if self.num_layers <= 5: + raise ValueError( + "Not enough hidden layers for default EAGLE3 capture") - self.layers_to_capture = (1, self.num_layers // 2 - 1, - self.num_layers - 4) + self.layers_to_capture = (1, self.num_layers // 2 - 1, + self.num_layers - 4) + else: + self.layers_to_capture = sorted(list(self.layers_to_capture)) + self.num_capture_layers = len(self.layers_to_capture) self.hidden_states = torch.empty( (self.max_num_tokens, self.hidden_size * len(self.layers_to_capture)), diff --git a/tensorrt_llm/_torch/speculative/interface.py b/tensorrt_llm/_torch/speculative/interface.py index f7cdd92a56..1d306b9029 100644 --- a/tensorrt_llm/_torch/speculative/interface.py +++ b/tensorrt_llm/_torch/speculative/interface.py @@ -185,6 +185,13 @@ class SpecMetadata: cuda_graph_metadata.__post_init__() return cuda_graph_metadata + def is_layer_capture(self, layer_id: int): + """ + Whether the layer should be captured (eg for Eagle3). + By default, does nothing. + """ + return False + def maybe_capture_hidden_states(self, layer_id: int, hidden_states: torch.Tensor, residual: torch.Tensor) -> None: diff --git a/tensorrt_llm/_torch/speculative/utils.py b/tensorrt_llm/_torch/speculative/utils.py index c4a4ccf7e3..16fef4862b 100644 --- a/tensorrt_llm/_torch/speculative/utils.py +++ b/tensorrt_llm/_torch/speculative/utils.py @@ -38,6 +38,7 @@ def get_spec_metadata(spec_config, dtype=model_config.torch_dtype, is_draft_model=is_draft_model, eagle3_resource_manager=spec_resource_manager, + layers_to_capture=spec_config.eagle3_layers_to_capture, ) if spec_config.spec_dec_mode.is_eagle3_one_model(): return Eagle3OneModelSpecMetadata( @@ -47,6 +48,7 @@ def get_spec_metadata(spec_config, num_layers=model_config.num_hidden_layers, hidden_size=model_config.hidden_size, max_num_tokens=max_num_tokens, + layers_to_capture=spec_config.eagle3_layers_to_capture, ) if spec_config.spec_dec_mode.is_draft_target() or \ spec_config.spec_dec_mode.is_ngram() or \ diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index da5071e3b0..6ed4dea76c 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -9,7 +9,7 @@ from dataclasses import dataclass, field from enum import Enum, EnumMeta from pathlib import Path from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Literal, Optional, - Type, TypeAlias, TypeVar, Union, get_args, get_origin) + Set, Type, TypeAlias, TypeVar, Union, get_args, get_origin) import torch import yaml @@ -352,6 +352,7 @@ class DecodingBaseConfig(StrictBaseModel): # When specified, speculation will be disabled at batch sizes above # this value. Otherwise, speculation will always be on. 
max_concurrency: Optional[int] = None + load_format: Optional[str] = None @classmethod def from_dict(cls, data: dict): @@ -424,6 +425,7 @@ class EagleDecodingConfig(DecodingBaseConfig): num_eagle_layers: Optional[int] = None max_non_leaves_per_layer: Optional[int] = None eagle3_one_model: Optional[bool] = True + eagle3_layers_to_capture: Optional[Set[int]] = None @classmethod def from_dict(cls, data: dict): @@ -443,6 +445,17 @@ class EagleDecodingConfig(DecodingBaseConfig): return TorchSpeculativeDecodingMode.EAGLE3_ONE_MODEL return TorchSpeculativeDecodingMode.EAGLE3 + @functools.cached_property + def num_capture_layers(self): + """ + Returns the number of layers to capture of the target model. + If eagle3_layers_to_capture is not None, return the length of the set. + Otherwise, assume Eagle3 base set and return 3. + """ + if self.eagle3_layers_to_capture is not None: + return len(self.eagle3_layers_to_capture) + return 3 + class UserProvidedDecodingConfig(DecodingBaseConfig): # Cannot use real type annotations due to circular imports @@ -523,7 +536,9 @@ class MTPDecodingConfig(DecodingBaseConfig): @classmethod def from_dict(cls, data: dict): - return cls(**data) + out = cls(**data) + out.max_draft_len = out.num_nextn_predict_layers + return out decoding_type: ClassVar[str] = "MTP" diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index ffb8e33766..f26fa244f1 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -1,6 +1,9 @@ +import json import os import sys +import tempfile import unittest +from pathlib import Path import pytest import torch @@ -120,5 +123,107 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str, assert text_spec == text_ref +def test_deepseek_eagle3(): + use_cuda_graph = True + attn_backend = "TRTLLM" + disable_overlap_scheduler = False + enable_block_reuse = False + use_one_model = False + enable_chunked_prefill = False + + # Eagle3 one model works with overlap scheduler and block reuse. 
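    # The Eagle3 draft model is described only by a config.json written to a
    # temporary directory below; combined with load_format="dummy" in the
    # EagleDecodingConfig, no real draft checkpoint is needed for this test.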
+ total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 + if total_mem_gb < 150: + pytest.skip("Not enough memory to load target + draft model") + + models_path = llm_models_root() + eagle_config = { + 'architectures': ['LlamaForCausalLMEagle3'], + 'attention_bias': False, + 'attention_dropout': 0.0, + 'bos_token_id': 128000, + 'eos_token_id': [128001, 128008, 128009], + 'eagle_config': { + 'use_aux_hidden_state': False, + 'use_input_layernorm_in_first_layer': True, + 'use_last_layernorm': True, + 'use_mtp_layernorm': False + }, + 'head_dim': 128, + 'hidden_act': 'silu', + 'hidden_size': 2560, + 'initializer_range': 0.02, + 'intermediate_size': 16384, + 'max_position_embeddings': 4096, + 'mlp_bias': False, + 'model_type': 'llama', + 'num_attention_heads': 32, + 'num_eagle_features': 1, + 'num_hidden_layers': 1, + 'num_key_value_heads': 8, + 'pretraining_tp': 1, + 'rms_norm_eps': 1e-05, + 'rope_scaling': { + 'factor': 8.0, + 'high_freq_factor': 4.0, + 'low_freq_factor': 1.0, + 'original_max_position_embeddings': 8192, + 'rope_type': 'llama3' + }, + 'rope_theta': 500000.0, + 'tie_word_embeddings': False, + 'torch_dtype': 'bfloat16', + 'transformers_version': '4.52.4', + 'use_cache': True, + 'vocab_size': 129280, + 'draft_vocab_size': 129280 + } + with tempfile.TemporaryDirectory() as temp_dir: + eagle_model_dir = Path(temp_dir) + config_path = eagle_model_dir / "config.json" + with config_path.open("w") as f: + json.dump(eagle_config, f, indent=2) + target_model_dir = f"{models_path}/DeepSeek-V3-Lite/nvfp4_moe_only" + + # bs > 1 gives non-deterministic when doing IFB. There are slight chances + # that ref and spec does not match 100% + max_batch_size = 16 + max_draft_len = 3 + kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse, + free_gpu_memory_fraction=0.5) + cuda_graph_config = CudaGraphConfig( + batch_sizes=[1]) if use_cuda_graph else None + + llm_common_config = dict( + model=target_model_dir, + attn_backend=attn_backend, + disable_overlap_scheduler=disable_overlap_scheduler, + cuda_graph_config=cuda_graph_config, + max_batch_size=max_batch_size, + max_num_tokens=4096, + max_seq_len=4096, + kv_cache_config=kv_cache_config, + enable_chunked_prefill=enable_chunked_prefill, + ) + + spec_config = EagleDecodingConfig( + max_draft_len=max_draft_len, + speculative_model_dir=eagle_model_dir, + # Llama 3 does not support one model eagle. + eagle3_one_model=use_one_model, + eagle3_layers_to_capture={29}, + load_format="dummy") + + llm_spec = LLM(**llm_common_config, speculative_config=spec_config) + + tok_ids = llm_spec.tokenizer.encode("The future of AI is") + + sampling_params = SamplingParams(max_tokens=32, temperature=0) + for output in llm_spec.generate_async(tok_ids, + sampling_params, + streaming=True): + pass + + if __name__ == "__main__": unittest.main() From 81fd468feccf8f575b51ab489e09823996c04c0d Mon Sep 17 00:00:00 2001 From: Frank <3429989+FrankD412@users.noreply.github.com> Date: Fri, 22 Aug 2025 10:28:57 -0700 Subject: [PATCH 32/33] [None][fix] Correct KV cache percentage report out. 
(#7102) Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com> --- tensorrt_llm/bench/dataclasses/reporting.py | 30 ++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index acf7f60bcb..fd76466cd5 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -273,6 +273,22 @@ class ReportUtility: }, } + # Retrieve KV cache information. + kv_cache_config = self.kwargs.get("kv_cache_config", KvCacheConfig()) + if isinstance(kv_cache_config, KvCacheConfig): + kv_cache_dtype = kv_cache_config.dtype + kv_cache_mem_percent = kv_cache_config.free_gpu_memory_fraction + elif isinstance(kv_cache_config, dict): + kv_cache_dtype = kv_cache_config.get("dtype", "auto") + kv_cache_mem_percent = kv_cache_config.get( + "free_gpu_memory_fraction") + else: + raise ValueError( + f"Invalid kv_cache_config type: {type(kv_cache_config)}.") + + kv_cache_mem_percent = f"{kv_cache_mem_percent * 100.0:.2f}%" \ + if kv_cache_mem_percent is not None else "None" + # Engine/Backend details if self.rt_cfg.backend not in ('pytorch', '_autodeploy'): config_path = self.rt_cfg.engine_dir / "config.json" @@ -302,15 +318,6 @@ class ReportUtility: model = self.rt_cfg.model_path or self.rt_cfg.model model_config = ModelConfig.from_pretrained(model, trust_remote_code=True) - kv_cache_config = self.kwargs.get("kv_cache_config", - KvCacheConfig()) - if isinstance(kv_cache_config, KvCacheConfig): - kv_cache_dtype = kv_cache_config.dtype - elif isinstance(kv_cache_config, dict): - kv_cache_dtype = kv_cache_config.get("dtype", "auto") - else: - raise ValueError( - f"Invalid kv_cache_config type: {type(kv_cache_config)}.") validate_and_set_kv_cache_quant(model_config, kv_cache_dtype) @@ -336,8 +343,7 @@ class ReportUtility: "max_batch_size": self.rt_cfg.settings_config.max_batch_size, "max_num_tokens": self.rt_cfg.settings_config.max_num_tokens, "scheduling_policy": self.rt_cfg.settings_config.scheduler_policy, - "kv_cache_percentage": - self.rt_cfg.settings_config.kv_cache_percent * 100.0, + "kv_cache_percentage": kv_cache_mem_percent, "issue_rate": self.convert_rate_to_s(self.statistics.issue_rate_ns) } @@ -526,7 +532,7 @@ class ReportUtility: f"Max Runtime Batch Size: {world_info['max_batch_size']}\n" f"Max Runtime Tokens: {world_info['max_num_tokens']}\n" f"Scheduling Policy: {world_info['scheduling_policy']}\n" - f"KV Memory Percentage: {world_info['kv_cache_percentage']:.2f}%\n" + f"KV Memory Percentage: {world_info['kv_cache_percentage']}\n" f"Issue Rate (req/sec): {world_info['issue_rate']:.4E}\n" f"\n") From 3d54a1a52181390ca58a39f948a8753e51f98543 Mon Sep 17 00:00:00 2001 From: Grace Ho <146482179+gracehonv@users.noreply.github.com> Date: Fri, 22 Aug 2025 21:57:37 -0700 Subject: [PATCH 33/33] [None] [feat] nsys profile output kernel classifier (#7020) Signed-off-by: Grace Ho --- .../profiler/nsys_profile_tools/README.md | 174 +++++++++ .../nsys_profile_tools/gputrc2graph.py | 349 ++++++++++++++++++ .../nsys_profile_tools/images/csv.png | Bin 0 -> 135258 bytes .../nsys_profile_tools/images/html.png | Bin 0 -> 143191 bytes .../nsys_profile_tools/images/html_tbl.png | Bin 0 -> 153212 bytes .../trtllm_engine_model.json | 62 ++++ 6 files changed, 585 insertions(+) create mode 100644 tensorrt_llm/tools/profiler/nsys_profile_tools/README.md create mode 100755 tensorrt_llm/tools/profiler/nsys_profile_tools/gputrc2graph.py create mode 100644 
tensorrt_llm/tools/profiler/nsys_profile_tools/images/csv.png
 create mode 100644 tensorrt_llm/tools/profiler/nsys_profile_tools/images/html.png
 create mode 100644 tensorrt_llm/tools/profiler/nsys_profile_tools/images/html_tbl.png
 create mode 100644 tensorrt_llm/tools/profiler/nsys_profile_tools/trtllm_engine_model.json

diff --git a/tensorrt_llm/tools/profiler/nsys_profile_tools/README.md b/tensorrt_llm/tools/profiler/nsys_profile_tools/README.md
new file mode 100644
index 0000000000..b7b9f084de
--- /dev/null
+++ b/tensorrt_llm/tools/profiler/nsys_profile_tools/README.md
@@ -0,0 +1,174 @@
+# gputrc2graph.py
+
+This script processes NVIDIA Nsight Systems (`nsys`) GPU trace files
+(`.nsys-rep`) collected with `-t cuda` tracing enabled, and generates
+kernel-level summaries and visualizations of GPU and non-GPU time. It is
+useful for profiling and analyzing nsys profile output.
+
+## Usage
+
+### Command-line Arguments
+
+- `--in_file`
+  **(required)**
+  List of input files and their metadata. Each entry should be in the format:
+  `<nsys-rep>,<engine>,<model>,<elapsed_nonprofiled_sec>`
+  - `nsys-rep`: Path to the `.nsys-rep` file.
+  - `engine`: Engine name (e.g., `trtllm`).
+  - `model`: Model name (e.g., `llama`, `gpt-oss`, `ds`).
+  - `elapsed_nonprofiled_sec`: Wall-clock runtime (in seconds) without
+    profiling. Specify `0` to use the elapsed GPU time calculated from the
+    nsys-rep file (this may inflate non-GPU time if the actual runtime
+    without profiling is less).
+
+  Multiple entries can be provided, separated by spaces.
+
+- `--out_dir`
+  Output directory for the generated CSV and HTML files.
+  If not specified, results are saved in the current directory.
+
+- `--title`
+  Title for the HTML chart/visualization.
+
+- `--nsys_cmd`
+  Path to the `nsys` command.
+  Default: `nsys` (assumes it is in your PATH).
+  Use this if `nsys` is not in your system PATH.
+
+## Notes
+
+- Make sure you have the pandas and plotly Python packages installed.
+- Make sure [nsys](https://developer.nvidia.com/nsight-systems/get-started) is
+  installed, and specify the path to the `nsys` command with `--nsys_cmd` if it
+  is not in your PATH.
+- For more details on available engines and models, see the help string in
+  the script or run:
+
+```bash
+python3 gputrc2graph.py --help
+```
+
+## Example 1: analyze a single profile
+
+To analyze the GPU cycles of, for example, a llama-3.1-8B model with trtllm:
+
+1. Run the following command to collect an nsys profile for a trtllm-serve
+   config.
+
+   ```bash
+   nsys profile -t cuda -o nsys_res -f true --trace-fork-before-exec=true \
+     --cuda-graph-trace=node --delay <DELAY> --duration <DURATION> \
+     python3 -m trtllm-serve meta-llama/Llama-4-Scout-17B-16E-Instruct ...
+   ```
+
+   where:
+
+   - DELAY: how many seconds to delay nsys from collecting profiles, needed so
+     that profiles aren't captured until the trtllm server has come up and
+     load generation starts.
+   - DURATION: how many seconds for the nsys profile to run before generating
+     the profile. This should be > the duration of the run.
+
+2. Run again, this time without collecting the profile, and get the total run
+   time in seconds. This value will be used by the script to calculate the
+   CPU (non-GPU) seconds for the analysis.
+
+3. Say the run elapsed time is .35 seconds, from step #2. Run the script to
+   analyze:
+
+   ```bash
+   python3 gputrc2graph.py \
+     --in_file run1.nsys-rep,trtllm,llama,.35
+   ```
+
+The command will produce 2 files for analysis:
+
+- result.html: this categorizes kernel names into different categories in a
+  stacked bar chart.
+- result.csv: shows how the kernel names are mapped to the different
+  categories.
+
+### HTML visualization with result.html
+
+The HTML file shows the number of elapsed seconds spent in the different GPU
+substages or categories; here moe_gemm is the biggest category, at .14
+seconds, followed by the "attn" kernels. This lets the user prioritize the
+kernels to focus on for performance optimizations.
+
+![Example GPU Trace Visualization](images/html.png)
+
+There's also an appended data table underneath the bar chart for copying out
+to other post-processing tools.
+
+![Example GPU Trace Visualization Table](images/html_tbl.png)
+
+### Kernel to category mapping with result.csv
+
+Suppose the user would like to focus on reducing calls to nccl kernels. The
+next step is to use result.csv to dive into which kernels compose the nccl
+GPU cycles. The following image shows the ar_fusion all-reduce kernel to be
+the biggest contributor to GPU cycles for nccl, followed by AllGather.
+
+![Example GPU Trace csv](images/csv.png)
+
+## Example 2: analyze multiple profiles
+
+Suppose the user has multiple nsys trace files, captured for different models
+(say llama and gpt-oss in this case), and wishes to compare their GPU/non-GPU
+time. Something like the following command can be used:
+
+```bash
+python3 gputrc2graph.py \
+--in_file run1.nsys-rep,trtllm,llama,100 run2.nsys-rep,trtllm,gpt-oss,102 \
+--out_dir results
+```
+
+The analysis process is similar to example 1, but now there will be multiple
+stacked bar charts that can be compared. The categories for the different
+kernels remain the same, so that it's easy to compare the GPU cycles for the
+same categories.
+
+Once a category is shown to have more cycles for one configuration than
+another, the next step would be to use the csv file to see what kernels are
+mapped into that category, and which kernels are taking the largest amount of
+time and cause the difference for the overall category.
+
+## Example 3: add new classification for a new model
+
+To create a new engine DEF with model ABC, just add another json file in the
+same directory as gputrc2graph.py, with the same format as the other json
+files. The script automatically picks up all the json files in that directory
+as engine/model specifications.
+
+For example, suppose this new model has 4 kernels to be classified into
+"gemm" and "attn", where the gemm kernels have names with "*H*" or "*I*" in
+them, and the attn kernels have names with "*J*" or "*K*" in them. The new
+.json file would look like the following:
+
+```json
+{
+  "DEF": {
+    "ABC": {
+      "H|I": "gemm",
+      "J|K": "attn",
+      "CUDA mem": "non-gpu-H_D_memops",
+      ".*": "misc"
+    }
+  }
+}
+```
+
+Each entry in the dictionary consists of:
+
+- key: a regex used to classify the kernels
+- value: the category to classify the kernels into.
+
+The last 2 entries are common to all engines/models, consisting of CUDA memory
+operations and a 'misc' category for anything that's left over and can't be
+classified.
+
+When invoking gputrc2graph.py, specify a trace file with this new model/engine
+like the following:
+
+```bash
+--in_file new.nsys-rep,DEF,ABC,<elapsed_nonprofiled_sec>
+```
+
+If the engine_DEF.json file already exists, just add the model as a new node
+in the existing engine file, after the other models.
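A rough sketch of how such a regex-to-category mapping is applied (this mirrors the `anno_gpu_kernname` helper in `gputrc2graph.py` below): the first regex key that matches a kernel name decides its category. The kernel names in the loop are made up for illustration only.

```python
import re

# Hypothetical mapping in the same shape as the JSON entries above:
# the first regex key that matches a kernel name wins.
mapping = {
    "H|I": "gemm",
    "J|K": "attn",
    "CUDA mem": "non-gpu-H_D_memops",
    ".*": "misc",
}


def classify(kernel_name: str) -> str:
    # Walk the mapping in insertion order and return the first match.
    for pattern, category in mapping.items():
        if re.search(pattern, kernel_name):
            return category


# Made-up kernel names, for illustration only.
for name in ["H_gemm_kernel", "flashJattention", "CUDA memset", "reduce_op"]:
    print(f"{name} -> {classify(name)}")
# H_gemm_kernel -> gemm
# flashJattention -> attn
# CUDA memset -> non-gpu-H_D_memops
# reduce_op -> misc
```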
diff --git a/tensorrt_llm/tools/profiler/nsys_profile_tools/gputrc2graph.py b/tensorrt_llm/tools/profiler/nsys_profile_tools/gputrc2graph.py new file mode 100755 index 0000000000..1ca8a0ff23 --- /dev/null +++ b/tensorrt_llm/tools/profiler/nsys_profile_tools/gputrc2graph.py @@ -0,0 +1,349 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + This generates gpu kernel analysis output from nsys rep. Will call nsys + stats -r cuda_gpu_trace, get non-overlapped gpu cycles, then generate + csv and html output for analysis +""" + +import argparse +import logging +import os + +import regex as re + +logger = logging.getLogger(__name__) + + +# helper data class for annotating kernels +def load_engine_model(): + """returns engine_model built from all json files in the current dir""" + import glob + import json + + engine_model = {} + + json_files = glob.glob( + os.path.join(os.path.dirname(__file__) or ".", "*.json")) + for fname in json_files: + with open(fname, encoding="utf-8") as f: + engine_model.update(json.load(f)) + return engine_model + + +class GPUTrace2Graph: + """ + Parses output of nsys report, generates csv and bar chart output + """ + + def __init__(self): + import pandas as pd # avoid importing till needed + + self.pd = pd + self.pd.options.mode.copy_on_write = True + + # helper functions for generating trace->summary csvs + def gen_nonoverlapped_sum_from_gputrace(self, in_file, out_file): + logger.info("loading %s", in_file) + df = self.pd.read_csv(in_file, + usecols=["Start (ns)", "Duration (ns)", "Name"]) + if df.empty: + return + df["End (ns)"] = df["Start (ns)"] + df["Duration (ns)"] + df = self.sum_non_overlapping_intervals(df) + # get ready to print table with elapsed times per kernel + df["Instances"] = 1 + df_sum = df.groupby("Name", as_index=False).agg({ + "Elapsed Time (ns)": "sum", + "Duration (ns)": "sum", + "Instances": "size" + }) + + # generate csv + df_sum["Total Time (sec)"] = df_sum["Duration (ns)"] / 1e9 + df_sum["Elapsed Time (sec)"] = df_sum["Elapsed Time (ns)"] / 1e9 + df_sum = df_sum.sort_values(by="Elapsed Time (sec)", ascending=False) + df_sum[["Elapsed Time (sec)", "Total Time (sec)", "Instances", + "Name"]].to_csv(out_file, index=False) + + def sum_non_overlapping_intervals(self, df): + """ + returns new sorted df with Elapsed Time (ns) column using + vectorized operations + """ + logger.info("sorting %s trace records by start time", str(df.shape)) + assert not df.empty, 'empty nsys records' + # Sort by start time and reset index + df = df.sort_values(by="Start (ns)").reset_index(drop=True) + + # Initialize elapsed time as duration + df["Elapsed Time (ns)"] = df["Duration (ns)"] + + # Get numpy arrays for faster operations + starts = df["Start (ns)"].values + ends = df["End (ns)"].values + + # Keep track of current interval end + current_end = ends[0] + display_units = max(1, int(len(df) / 100)) + # Update 
current_end for overlapping intervals + for i in range(1, len(df)): + if i % display_units == 0: + print(f"processing trace: {int(i/len(df) * 100)} %", end="\r") + if starts[i] <= current_end: + if ends[i] > current_end: + # Partial overlap + df.iloc[i, df.columns.get_loc("Elapsed Time (ns)")] = ( + ends[i] - current_end) + current_end = ends[i] + else: + # Complete overlap + df.iloc[i, df.columns.get_loc("Elapsed Time (ns)")] = 0 + else: + # No overlap + current_end = ends[i] + + return df + + # functions for generating html files + def make_html(self, df, output_dir, title): + """make html graph from df""" + import plotly.express as px + + if df.empty: + return + output_name = os.path.join(output_dir, "result") + if not title: + title = "Model_Engine" + x = "Model_Engine" + y = "Elapsed Time (sec)" + color = "Category" + """ generate kernel mapping table """ + # Sort Model_Engine categories by last field after underscore + df["Model_Engine"] = self.pd.Categorical( + df["Model_Engine"], + sorted(df["Model_Engine"].unique(), key=lambda x: x.split("_")[-1]), + ) + df[["Model_Engine", color, "Instances", "Name", + y]].sort_values(by=color).to_csv(f"{output_name}.csv", index=False) + graph = px.histogram( + df.round(2), + x=x, + y=y, + title=(f"{y} for {title}"), + color=color, + text_auto=True, + ) + # wrap x axis labels + graph.update_xaxes(automargin=True) + graph.write_html(f"{output_name}.html") + """ + Generate data table with columns per Model_Engine into result.html + """ + pivot_df = df.pivot_table( + values="Elapsed Time (sec)", + index="Category", + columns="Model_Engine", + aggfunc="sum", + observed=False, + ).round(2) + # Add sum row at bottom + pivot_df.loc["total_elapsed_sec"] = pivot_df.sum() + pivot_df.fillna("").to_html("temp.html") + with ( + open(f"{output_name}.html", "a", encoding="utf-8") as outfile, + open("temp.html", encoding="utf-8") as infile, + ): + outfile.write(infile.read()) + os.remove("temp.html") + + print(f"Finished generating: \n" + f" {output_name}.html for stack bar chart \n" + f" {output_name}.csv for Kernel-Category mapping") + + def anno_gpu_kernname(self, df, mapping): + """add "Category" column""" + + def anno_gpu_kernname_helper(name): + for kern_name, val in mapping.items(): + if re.search(kern_name, name): + return val + + df["Category"] = df["Name"].apply(anno_gpu_kernname_helper) + + def make_nongpu_row(self, df, nongpu_sec): + """this will append non-gpu time entry at end of df""" + nongpu_row = self.pd.DataFrame([df.iloc[-1]]) + nongpu_row["Category"] = nongpu_row["Name"] = "CPU(non-GPU)" + nongpu_row["Instances"] = 1 + nongpu_row["Elapsed Time (sec)"] = nongpu_sec + return nongpu_row + + def is_valid_file(self, base_file): + """asserts if base_file is non-existent or is empty""" + assert (os.path.isfile(base_file) and os.path.getsize(base_file) + > 0), f"{base_file} doesn't exist or is empty" + + def should_gen_file(self, new_file, base_file): + """figure out if new file should be generated from base_file""" + self.is_valid_file(base_file) + if (os.path.exists(new_file) + and (os.path.getmtime(new_file) > os.path.getmtime(base_file)) + and (os.path.getsize(base_file) > 0)): + logger.info("reusing %s", new_file) + return False + else: + logger.info("generating %s", new_file) + return True + + def gen_sum_file(self, file, nsys_cmd): + """ + generates sum file from nsys trace with times per kernel and + returns the name of the sum file + """ + import subprocess # nosec B404 + + file_dir = os.path.dirname(file) + file_name = 
os.path.basename(file) + + if not file_dir: + file_dir = "." + # Walk through trace and get the total non-overlapped time + nsys_stats_file = os.path.join(file_dir, + f"{file_name}_cuda_gpu_trace.csv") + sum_file = os.path.join(file_dir, + f"{file_name}_cuda_gpu_kernel_tracesum.csv") + if self.should_gen_file(nsys_stats_file, file): + cmd = [ + nsys_cmd, + "stats", + "-r", + "cuda_gpu_trace", + file, + "-o", + f"{file_dir}/{file_name}", + ] + cmd_str = " ".join(cmd) + logger.info("+ %s", cmd_str) + # estimate time based on calibrated 240M/min + file_size_mb = os.path.getsize(file) / 1e6 + logger.info( + "nsys stats for %.2f MB file expected to take %.2f min", + file_size_mb, + file_size_mb / 240, + ) + try: + subprocess.run(cmd) + except Exception: + logger.error("%s failed; Use --nsys_cmd to specify nsys path", + cmd_str) + exit(1) + logger.info("generating non-overalapped sum %s", sum_file) + self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file) + self.is_valid_file(sum_file) + logger.info("Finished generating %s", sum_file) + return sum_file + + def gen_graph(self, in_file, out_dir, title, nsys_cmd, engine_model): + """generates graph and csv file from in_file into out_dir""" + # Initialize an empty DataFrame to store combined data + combined_df = self.pd.DataFrame() + for idx, (file, engine, model, total_sec) in enumerate(in_file): + file_dir = os.path.dirname(file) + file_name = os.path.basename(file) + if not file_dir: + file_dir = "." + sum_file = self.gen_sum_file(file, nsys_cmd) + # read kernel summary file + df = self.pd.read_csv(sum_file) + # annotate kernel to their categories + assert engine_model.get(engine), f"engine {engine} unknown" + assert engine_model[engine].get(model), f"model {model} unknown" + # remove nsys-rep from file_name for shorter x-label + file_name = file_name.replace(".nsys-rep", "") + df["Model_Engine"] = f"{model}_{engine}_{file_name}_{idx}" + self.anno_gpu_kernname(df, engine_model[engine][model]) + # patch in non-gpu time + gpu_sec = round(df["Elapsed Time (sec)"].sum(), 1) + total_sec = round(float(total_sec), 1) + if total_sec < gpu_sec: + logger.warning( + "Elapsed sec %.2f < GPU sec %.2f resetting Elapsed sec ", + total_sec, + gpu_sec, + ) + total_sec = gpu_sec + nongpu_row = self.make_nongpu_row(df, total_sec - gpu_sec) + df = self.pd.concat([df, nongpu_row], ignore_index=True) + combined_df = self.pd.concat([combined_df, df], ignore_index=True) + if out_dir is None: + out_dir = "." + else: + os.makedirs(out_dir, exist_ok=True) + # generate html file + self.make_html(combined_df, out_dir, title) + + +def parse_tuple(s): + return tuple(s.split(",")) + + +def main(): + logging.basicConfig(format=("%(asctime)s - %(levelname)s - %(message)s"), + level=logging.INFO) + parser = argparse.ArgumentParser( + description=( + "Process nsys rep and generate kernel non-overlapped cycles. 
\n" + "Example:\n" + "gputrc2graph.py --in_file d1.nsys-rep,trtllm,llama,100 \n" + "d2.nsys-rep,trtllm,gpt-oss,102 " + '--out_dir results/ --title "Model=gpt-oss TRTLLM chart"'), + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + # load supported engine_model + engine_model_supported = load_engine_model() + # Get a string representation of supported engine/model combinations + engine_model_supported_str = ", ".join( + f"{engine}:[{', '.join(models.keys())}]" + for engine, models in engine_model_supported.items()) + parser.add_argument( + "--in_file", + type=parse_tuple, + nargs="+", + help=("list of (nsys-rep, engine, model, elapsed_nonprofiled_sec) " + "separated by space. Elapsed_nonprofiled_sec is runtime without " + "profiling used to calculate non-gpu time. Specify 0 to use " + "elapsed time from nsys-rep but that might inflate non-gpu time. " + f"Available engine:[model] are: {engine_model_supported_str} " + f"Example: --in_file d1.nsys-rep,sglan,llama,100 " + "d2.nsys-rep,trtllm,gpt-oss,102"), + required=True, + ) + parser.add_argument("--out_dir", help=("output dir for result.csv/html")) + parser.add_argument("--title", help=("title for html chart")) + parser.add_argument( + "--nsys_cmd", + help=("nsys cmd, e.g. /usr/bin/nsys, Default: nsys"), + default="nsys", + ) + args = parser.parse_args() + gputrace = GPUTrace2Graph() + gputrace.gen_graph(args.in_file, args.out_dir, args.title, args.nsys_cmd, + engine_model_supported) + + +if __name__ == "__main__": + main() diff --git a/tensorrt_llm/tools/profiler/nsys_profile_tools/images/csv.png b/tensorrt_llm/tools/profiler/nsys_profile_tools/images/csv.png new file mode 100644 index 0000000000000000000000000000000000000000..3fd412f657772e98aaf6c0b9781faef414a865c2 GIT binary patch literal 135258 zcmeFYcTiL9w?9e;K?S5qZ%PpX>Ai`7bQGl%KzfbRN$9AEh=53M0hHc*3sI5Yiuhlh7V^YNpn zcz9&Acz6V_$w+`J@R~C&JiP0A&Z?>gnyRWC2A*ICXE%F1yvI?A#-t{OJ@i@TPc_M? z$ken}b*bWSDQl4lh8}Re(|Szs`j+iOgQ~n}@^Z^ZmANGxLe0$t)l-!M(J%Z2qlps} zeO_5^_NG%QCkHJ-_k+Av5SJMzD;ZvnTW~_WJVTCzr5rQ7MQw`3S2*Uxq`O@UgL3yP&vM%{So*sga{8z z-MdJPM`a(^knx1W@Q)AGMsH38)nwtzB&LSKmv5P@*eW&OEvTH|#uL@$h;zh;O;ZH; z<~*QSe_&nvP3_*N$DB7qr-YS4(#YnCsZc7+(!8o)hkA~+csIaljO}RzDiaTkx*wN5;+Ot9@Vl{4S$|Ww@UiFj^?{wl4Z)WINBW= z4$06)^-{8r?6opN+4NH4+c7#av2xu?G~J|u25TWk%RHXHPE+*iQ7*`c+x_Xb&ULQ0)T-~WjnrF_XLIp8On4;%gWutA=Z|0L`#)Ue zkiTQW>wU{ZKasXi!(Rs9`{700_j9|zO!nU-F3k&Jjl$0&d!;F)2=p46Ga{qU+EZsZ z`<$&4{amMIMRq1CzC)Kk;Q7eAc>ar=d&)je;v0bie2KBUTr7f%qgppKh? 
z6bnthSz}AGoh_gE1#^nJDHkr5u1uT{tsN46T~6u4iY|%m+lahXK;5w`_tV$S#*^&sJ&1vSFIiV};ZLgK`nqvkYh{&+_y%mL@EaCr>N0 zVDPdWpl?Gipt|VaK%pfs7*rxmR8`H>NxlFe93*!pDu}LsM@PL0J%G+&FFQ1B9Ms

-QF#5W&`?d- zo+MYl*1p|~C3K;z*(3k@L>xN{zwmZ*#NzW^1YYU@5DaEOZ-ANun)7 z5;i!o1`CE`Wd!K{%ewMXTYJzBIz(H2et_!BDts0&z3K3eALv^5yJmrgU zi5UX4Pu>BCh~CGZPL^bu8$VctP9f zj*WBDJ?lcgR2L{}V~`0Monqboy2c(LJI4)pRo{Oms-cI(XsOESeXCA{wvb*u1Dm$hYWJJvKwlftJb3!GYR64QA$839?dMIjK_QqZtLTkE1hYa<2 z_ER*5KK+|33pRi!V31UZ^d}<^$##4VZ|IIRP6mT(Yqi_3VG5GO{lLFG@f0Qi_esf@ z@3^EZ4CiZ?-7 zkoXBW?L3CP2d56@bMra;hgneQV_t5~EWqRu`qGUk3-OIZ(daPl-g;tChWW7u zy1rQ^o7k?;q+Cf^uQy{ z7yp3RX`*rsabAR+%X|*m=X@F)a#XZ0{FdaagE{3Wy9`Br$1(WTrXO6lCT}$C<=*m1 zi={E{Zx-*M`=4#L)^?#+tmod@TB|g-M1>#98;f$W9cVPX^H|uB7%f!&HdpqvI}EU% z^lRUD(%30*GC+Ahq6Q*q{jyMEU!``L%+OCAv3`MuTKMO@(hgz~x6cdGx@!r-McM&p zzvlvl5e==o$O1Mkc~q2ZmbGL1cI3im>yXB6EmV#O6yyg=>)gW+qR#(3l~}LlS%a2` zKWMB-y}Y|;8PNPjKXR{KPMm!GRbnm~GLwSwEZ0CL-HeAkNAMAQ%FZjDBYYj!zyPVU zTyiv2Hx;W${}C;6M;sC~3Y^R;1{liH%;v=eOx!Ux|6v;Wt14C4MA{Yu5RKfh!mG%+Q#wk9|SEn?8#0J3#Vy~`b%)^nLc7Y&aa-Q@Wh#7yLW`!Vk>Nw*l zAW-rVdBsyUmafopI6&2yv4W>nPmn z8fZ6KGdV{i$rRT=#wCImiME`4+2wuXRsNdY&aIG+2;nqZ58Hyhvl~RWvs7AtNb{e@ zR~ddc0acWp&rut6pAxFdltnuj9c0~NY1#H~THE%HDQ^APLY%|FSN< zKJTN|I{@<$3k|`j;^fkKu3@3oiynB+of(1kzDY1`K?>ML?6VX`837)`X`jY4Cm8$P z&bQb_i;Jb$mnn&DVuSv=)R3l*=jA4CCzy+x#QMEU6`RVhV8S~3_d~o~#p8^0)UjUi z%H*>Nm8nJL_Xlzk91M1GPGseVEMrL`A<-1f`W-dL4x4qU9DH-~a!@?oNmbui3@eDG&! zQRMHe#W*THCo)g1HZg0L9LKW3@vbdWh?fO;N=&7LSNUdvy$2Z(RZMS+9XdYB8@!00 z@FLbz`wASa>~oXG?(H~q2pVdSdEi>2b?k*|E0Db`gq<@u08*M=OWD$=KDKS!mAakqrCW=ys{T7iWMlON`nyJbzRA|CWi>HQWwmP`O-C|^s`=E}uN1rya z=-R1k_;FZo(STZ2e#I_+J|ST&JL%GWs1MI9rP=YP8_zhzNdX69Zk1 zFRzndP@@EsPWj`X(q8@0|BC1=Gs>#z^qk=Uh@d5(JByL?Bhx%`+R<5n62ags_w0r@ z6iT}-e}F9cIDL(Pe;~P;C%QDx3&_cq8m6h=g-g?3{6Xk{yS4`UBg3T)khIB<3B@5* z8=2kDBn|~A3)l(p0P`aF8<6%S`YyJJWwnDPO0@7U+Vtn&b1-Qa3zFku)}wruCHzbv zNU_$9pl~93m&8-J}t80 zcdkXny{i^SJdqP3FBuT&OxD$9zYP&Q4G$`s=q1GpQf$tlVF>DANga{cFdn6YCaPU% zXrCAjTO(9LPw}KbzK-;;jIZAvs`Cvw(w(8_L}E|;YhLU?T<@$H-^vDlS$)zvi-~{k~&Knf;*T>_T5jU`@qO%VGjeoAJ#VKE+WT z>~}t^!Q~qsGg(z}F6eMI=pzmdVa6po1@i!e4`UL39k^-|KXK8_kHFyq=V8XJ55*Kd zjUL6c(2TnB_osR#vdWRd>JM~m#9*2})4SX5n=e(lfe#wA@RTX*15v5u0;>ajW@=Mj z5-^@3&rdJANp|IJeuF?HkRpSA9n!P~L9p0;VjdXLv;eI^5a--+9w;T~22v%qCArQ_ z`yaCmFxF-DiCL6)qM_9E7|s&CX{`9oKN=wDXg@d31hJ6m2c}g(NfXi9?Z1kFM>9*q zEMt9}PnZ&{40|7;J4nAr< zml1z$8;Ddw;P{nogZi8qmls69Ecz@`5-{5C53D!n*8ZjVlK?_dErZ4HJ z#73X17xaXG^O2`x2YMub&Ny0+@7(p8^fCxh=HJCq(80j56dR+ZU!;YWVB?#M)|w|@ z>AZrdO;wSD^K%jVfCq(BXU_SSI+o@h&#^GD{SP3?RylhgR;&|dm2LLCvFMdN(hDWY zM3Ru?F%UbMw=~twn@)iH2x-JSNjc(DROWEXe24E;@6NVMdf9(x)L34<&zl85#EUoA zQ=N=bV=ai{j*{5;R$m`jdTb!??(2B#)W#}Hf&O&6U*;oRvugdZRl2t?!;+lwD0%Sx z+UeU=W0TmJb#js&z_8bUNCU(*@6fp18H@VkO&+){t$Z>R(gFFJgge+QUC> zv%tPdORD_BTt4+h+wZ5g4Tv?~U-n&D;&RqM-u!spyP`>*H zJ`o7&=k)=Crhp+nl!@S+hm*dRg0p93XsGF$zXdZ`DbT7Tmha1!iM;o8$lkxZq!#{d zQf3YHXiP>!1zJvp2AWOCrD0STP#?28+^-wYI7Kh6@fVTxPsI<$G z)c2>*DLJslFyDma@PkHmnuFMuig9Ul`90i)SabG-$qB-|{ObcvPmEzsdit>$^A?TA`HUmt?ZfrMTfV9;aWcOQO+ilYiE$x;7$MEs&Dct+Y#DaXMzsY zrsO!7);cvMvh_u@SHShk+K%w_?ak=km6)7(g_6LxvAO%eV1*j%_0e^0bum@$H+$UMYD5>U*7c>0 zmS)@4{W8=#-$}HVLxCuNyB(j=?Gzrf(x<4@H%n!J3kN<8sdJOy4KPs2xF(zb8^l7q z&2kYJL=7dYRoYUxHOdEF3PyY{daBkx8Er49Xp4w?flCHo|~_k4CZ zbVIEbj;YqLzVPTh-BV}yB~GZ8LFPPXvRD<xo-#`JI@_B{o=0eoW349CEx1KI|P+5y|R$- zkRmvDDvi!3_g1m?LKGso^_baJ_#t0r)74{}brg81xyNdU!=oW15?<-=w1R)D+c^GG zbrToMP{`TsEazT|J(!w>WI#vr(z?@woSGKnusmog@TiivP`PT^l}w6*)p|@J zz(`9+f4ylYt)>vK8Bb!y(OM`1Epu1Q zR-DJqnh!>F+e~3{d*;r*D>WzeW2^y5T1-U$-aHUVb1tIf>nOV+lxTW(ew5HTci-b` zcfaqeSEs%`C#xUQJF0n`*Jtv`oo~jFVl`J?7Riq5D=x*4v-4Xsk_I zEa))=WJw@CzRz3`(GdxON|O6yvp05^!PL;k$S_y}&DWjUNFUsG6zP;8W!eudS>B!P zDK2VgsGPN9aHLMz3><6WOU;?UjPcryFRBQ6m7u_qX@b*&`X$9ZY|2dli$h@;jb0Zmdyi@|}kvBQw0lpJ-q)2%YBqRUCwyqBF^f 
zUYT{fB#kCjI4_Jc0K`U>`Mhi@(8a|-W*chYkP zJU{9YUZq{MbG2fN4SF9gx?8|sJB6*OlI_}(qdhqFF#3sya>_2De!^OO!INo|u=~(o9S!;?i@TxJrUF_T-fNDcPuRu!H@2&suxKUhyb1a(zRvsh`!+s>7 zt^FuTPToIbzanZy;Ul&~%WuyB!Zpt!th)zmV$9)-(kWR`VEd@bp$V~JZ|8XSF}pQ> zl}!uaqG{%6m9eIZaLV*TX)tPzp-k#08cmQApD`h}u)jzE@2=JSlFJ+bf>w9dpX3fg zQoA~8y(nRI%)pAJ{C6adp#7SQ_{ZioCmU=okR*LErJz7sVZXE7_<`4c^_C}|lB$6I z{WG#q|LYz?9>IWRiyb}JyJf|tOa!{~s4BY0TOShvPgfo`B`$dP|E%0Psl z6Vj<#rQ(mw^8{{6mmq=|<-~nXzeDDcz|K(}Ho7R_UQx=Gw$;36=KkbT`F~gdAT}s? zqmd=EPo3b+YnjaCSg=KSpV%J%FCZ|x*}@deDVd<=7g)M35t#$RM_3beX6AMWnsxcU zZcl`05jx{>)Uu=MeZAb!DGv9>IQ2efaLb0CJ0(f6b`A488%H>18=t{*l6uClHf%T`lv!?unayONs=VLs6o#}n9fAHF$4TQtKN%AA9PVRAkFG-Vg$J;a8;~e$v z<=!ie{ z-^V$J!XtfwMR@3VN{Nh9Iw%7tkP{zUwG?_k?q^(~8=tDbBbhRdyxAz_ZyIss+*V=* z$Sv_)Y%==wxRiWZel&^WYdbU9JRw49J`B91-S_WwH5-~N%Wx^ZfW(pA79KU-Rs#JNph_Fgl8YOmQ`>br z0%Cc;)zL9!@wv$$?4Qa?AOO4TYTCUd4g4t#&+;Whf2eQWzP#zCjkzI~T);z?9~RoV z2l%hUVGPU$Jv=y4ft?dGoS*0i8j5gs%r>dKU$j_%#lt@;ywE>=J4z^P1m=F#DJ`Lx z-ck45)fq`)P|UdZ7N#T!?&CB@3i@CUCx)~qb3{``sOv291OI|Zn_+gs3D zQuuGb@g8ekV19HDM8XKL@sBy9oNna>UGF0k1e2pU9aMM7Li1h;i_U9*nC)_)L4NMR zpNXf*LHY~5yN0%HD zapgc6qa9iF{*?>_+joEFdc=_?|rr^LhLb2mj2|7{wiPqKMJdQEVnz5d;7Pf3;$>6 z1jLL7B-tn2zbX`tze+j*<_~!}W)cHdE3Wjfq_>ZG?yaIdX}=dxjeZ@7LWSfJoKz&@ zYJZsj#QNqjwvyRIXauO@DtK`~2fNZkhu#_ooO`X8oxbp^{5dFu=2$6YWl@-PIlcM( z^o>@A6f6oaw`+(KF6t#i&eW zPd%uQE(D<<$-joZ+JE*kMbcD8!eBgW0#IHQd7w4`tskHmNiD~(7kZfjbB|9gMK4++ zMOy10BY#5}0zoDoXF_zG`Tgv<)&)`sb7^EFZGsCXdes4=x8U9-Iyaq$gX10fap-Bx zr@QM%J+jb9QTokd=xKC0r_!^{4{g{ZgyI^A*xXxT1Fk{gchC=kIGLHQ*hPV=Wnos7 zlSjW@+PM4sJfN??bbfsC_mUwrD!fK791PHH{hQ)Y=XFs66KNdYfTr36wD92N3{%?_ z`&AltHv$EZWuSQOP&vVgO|+9UyJWVbEd*O)qW3C*jm0tWi{6LSWSxpeCr4o|RadhKvtdvY(=VMCWX$mF z=~1^{k~!n$D7Cy#P)61YWqT;=V$D5zy(+mydLvbUxZTe}9@#x-luLeuyo5JS6wgse zNvATJ!0zunDo_4@&Q`ZLKKb**0x%g_Fj<*;q=Sl4 ztXngnwV7H$tzYF>qHX%^9e?U}!=+Bukk4HqrP{Wp(}Q`!VlUod3fog{-p@~t zB2|Zt`@KFHMsC#+SR6h82oKI9lW&d7?=N`5E$@>C(k?q?b+4KteCG2c&&}^Q3mc#&{YSNlqRA{t> zR|U*v{dmszm_GTLTaQth4)YX$ui*y!S#pf{?d-noAM<_f^AoxlKzPE+PLS6Fh?8h3umt>4v00M<)_#X&BOX# z0ELI-rze{^?z5#jIEAq~SY%#Y#eMMF9*ziSaN_>?lsBZIW4pO)x+YG!iO+qVWZ-4N zzx>(wgJnTlb3!wSb++N6Wl+&pHW)VMe3FpLZ#cV->r2QllPZ)OT$OqiH8#gu=` z^u+U!o0OX>Lf`MmD}h}!&^F=vvLHk^+8JZONENaBU|~@lbWiT#Jvlig>)+8qEs> zskW+~EuG{>=lvibTl=oTDOxATD!4D)>t>VagLl;O~=V68C!|3H}R{tJ1+Mus42SD59%1OM&b&1b%4qFMoOq2Q_yc&HCK7fy7H1hfr zxs&;1y@E2VKHc_@d!SI6HRY9q!s9F0iWO6mA$zkTk^-G!(mA&an$)twPaE9nzXIK( zZI;`P_r6aNTL$v?@9@NiSas{CwfHjZq9rc1c#LNCZbS#n{zdC)r$654E*Q#7s^bXj zbyQK|y*sQ@0Yi=|x#AeYAY&G zJwB2Sw?VIMDIG#TStR)k&rJQ)TL4DcOj={kg#N`F;;b@-@vDeEyah9iSWe7a0hb>& zxFb{{lbgU0Uc|jYG$H>_rcs{EKafSX|3w!43qs2a{6%VhMSJ|8v?OR8sxbUrgZEVW&)FbNtv}oL={Tj?*J$ z`Uhi>^S_M2f4Tqvdi{Tep#SUs|LgVtjW77G*Z)`e{lB5)f61QzhSd81Ig~Y%0T(Or z={WGah->N1<-*aBP16*g^Zq0krA<8~yHeBw4YdR&0v}4fN&i{n$aZ#49CKIf^LS%6 zQD{$mbhTTMVOn}L-_zFjZG*7>th8x(sY}ic(-SgY^_L5JK}?a@*NUD_l7`Tl@sZhN zS`WaAj>%Gzig{75G~-L~()!+q-vH#z7Oyg$y68FA4DV{P^lu25Af&wIRyOVG{#3Vt zpER+`)FR2XB`|d-H^slX*`~@0`a)>R%`)rRsvPZi;Tl&yTuOf41*;ZkWz3t483Uly zYT0ZCP=mgurstUv(fOY;LEq8b0jHHB4t^xP{glx0@OXEG=4&^%moM>n-E|MZOw-}c z)1UgRnooL{-GAO|E_*~iJskON0ct=lbBSM?Wj6e?hJnR}{|DBY-j-s!XZCl#`sq6; zE$T=iZH4mMaD?jYX4@dSto3%ls&2_89~>wqDz*iv9m>2khAbwVG~PxGywZ61iq@lA zqRmIlB;h1@SXX+q+SGfMHmq}l#>@}bqBb@7jGQR3W0ty zc^=gy&6ypww47!?^xquu!+f4TM8{jsMHd!*<9GSNHr#CBf}+@)iI4zO6-oAZy2dp9 zN!qDb*LQgF4n`yPHpiE0WBC^puD}BlJPh(pxiTH*=d-i{C?1dbu+LM!rt<$W(_b=T zZh^ME2>*wf?#w)y{;e_Scjl|4eEC~@l!%46|M6T=7{kaUo-BaBGo2K>@_;nL)yG~| zBC-Lb(uo{1jsgghIY{|<-0G5*e>m3hkvx(Lv z?L6~vGVE4yOsFI(LGt6Y&enYC(okQ9R~(_IwWSv4`Q?HUg@Ur%3fNJ7+wy^P+3SH% 
z1+s3if(iV@8pfO7#u>gR!4eHGSXXz5WRhpFTA~{56*h8Eurln08#W{DRk}Wwa6;{47j1ao#l7W>9mgM>8$ZqM6{-dQznPk zeQan|oX1Uq=J2QO+RJ;~2Iq1M`+?t&JZi!e@NZ)Ngw*G#U4Me0>P4FSI*&r*6Lqe( zRiV@br6974QqqL{$}mme9LTDG#o{cL)2RPexpc~RWPCxo(GVYA$_|C)tmLa5^`Ac^MhE+HHnBH&4#K$`dnq+uAR)CRxcqK^1119$2cbCl3K)l z`H;$?Nocp**UXdQ;1TgF9=}qOPeN9T@)^llU(tvBWDPTbf|7AQ^kEr6cSw$SzvN?U zU$?0wAu-yG*~&_X>1iF zkyX>kWh$OsPIoF-?c&;l6Uux`jV^pT*6Q?@am(&kdX6V12hFyW=}RtW!AbhuB@5{p zd{i@CoiSXaPcy(yYn?i-C+{258J(4*v-w&sfy?F>$IAoiMQ^WY1d`3tz1ah4PxE_z zGo_4Z8j$|R?^P!+@Y;!$8|n;>9AEp{x|_e1@np5uV^C!}T#`VYm+vHM6s_pZEWMh(TUiiBd=R+fi)D?^6kpn~6sk zP;Rgskvx$<2-PKS{F8Is^Zpkzya{*Ov0u9x5(WOXebK{!>R+$zsKa0Co4p29d z-PJ7|bOG-yMEXNsrb|=f-J=c@C_6F*=Tcsi)D9N&}qvDYo0j1X13 zPj@V$MK69Yw_Gkx+*kd6F|GK8r3}gKILis!7^jzJsky*CEn=LB)r&H4`6Z0Ro5yBDej6x?u&j$M?Mkj#CL+$vM&7HD3-smnTe=uhI|m zz{IxEMJ#(oOf47MCX^F>aDUW*1CicEY%@ULBB`Aw^Ml$?OPMm9vY&`7t$gnQ>LWk) zGxj0XerW~o{Y=F0c|0&aLt|jn>3hA>{jCe86H1cWWy%P-AIqxKt} z_>&BtwK!-_X6nZ)qum6VMB+=F7FK_XIbd(;rkc8!=V;TM_s1AArQ*44<&_RqrY_|) zXNue{mm2+Dkba2je!!=hf_K0H!w}!o;}O+d!@i|NwcPpE18xf~VdT0l(EPAxsTpYH zcqp_sEV5^}XsZVgG@GF4i0lPnw$-=V zRx!w-wiUi@t@+{9ecC4bPij}uCr7J~k>An>Ev7!rXx&LHmGTtS3if=gTDWO9thmD! zd$=qx$e?_;@25~_pmX!3E%5mZ4_HSg?m-v-G=@dUl2lgYlxI#_fXM3g`fzzuh{yJ= zM78t5yelUF=8A6plh%ZW3cWJ?2pxWaFj-$VR>T|~d?>0KTX;6PV61Yq#ECbtSo>4W z28M=<6IE0#t=HD(ZM&;^Xj_+F4`9RJdg~+^i3DFAOp^$l?;V_p6sfZ~Dn0oKV;t%3 zCvg5QvrMua_QAd4OHqdpEzBRKa~Tg+3p{F5d(Fpda@Ru1J+}4oVec$8G3Cg;@6IoK z{rRhOc!tY$5?znxCSb_qF3son3=-5k#z1i6{p33Pdrd`xC+A7`(+$w#;9Kuqq})oF zQTpn)#PM1^;i>g;E9D=Y)~7#ymILg>NtYv+^oj>lcE0c!y;IPEkIZpJ#68Jl+*2^P(}Ht4)h{7;V0KI+{2@T7PV% zU7#zt!5m!YRk@EPL!1=)A3&?{Z|%^6%g3taj*8h4EFB-OP+k#UlPj`!k)2pluj9pj z(#pX_ULA=0LM49^gDx(I@uqJsx(-}%*e=#qTG~o^#-b;H(pfwkVpC9=0sfrzKKIKm z*TT&rru5Y0ExEy7s4IK@P8{!(ONhb`-$x(`l&A3rxetvjU+#wix&X{~&3ZL#sCvQY zo_5t(s(L~l8TCz~b#`QeN2w(Ya)F{;?Mivtp>SADYk7>b=cye`U=aG)?c8J z9y4@L2^ZS6wj@WW4JZ$w6Y5&kizO9ea7IZ|u+I^Gmh}-n?A!*=BD>iCS z*v1lP*OwlY^Opz#kT2(tm}F<`TTe%i*{^qSS={RC+gD%a~*^Hg;w9 zm1Mlx@c&}(t%Krfv%OK=B?Na1?jcBkMnVD!9talPNpP3O34!1VL4p%JxND=q-QC@~ z@dg_CcHT2*&U{sK=gfciR^6%E)wQd;p6dPV(bdQ1Sc&g+bM;*RaTfd6k#4~9U+E>nSs9% z-{Wx===)QaMmW$-UiZhAcL?;O$AhkV#QQ)zBtGz^dFL;coW4%by+A%67a4-<^%-zb|Cl}QZhfTnp^wCY@Xp?RE!6!~NRGFw^Ng2Ojsnw*ewvbj(mO?TfqqJgP}v##a}PGQ%G! 
zB77mW9Q}7d>}*&+Pj8Ot`uK>uf`r7kRQ9%4+=uw8^k;t!!j`eM9k=^SY(q1QgGu#~ zs+z&0NiFhktde?eSeg0*aEy>QvmT#M+3v+$8=v9_utzun6B}W0sDXxBPr@>{u(aVw4>FZF zrLr+WV$b#TkBewGy~I=OOML7tax?F%FX?mZM(=Szn8Y9C9Vffd|Fr#4`ut@vjD6i5J>G?OA!jR^QOqs%M-p^^AIq2jE57;)a!$REn^U0-Y+}y zC(X~Y&sJk}JL{vD(y;JatV*tN^6ie69{z{sbP1Gs2C)JkrY?4kpQ>`Y(F^%wa;9uG zqC)3eCQlev!o=%iVA9aFutv#V(D7(zORI{uQQUS{Y5nF(6I5ym;{I^LmVRE=;FJfV zIYm`6{>1hnI9ge}I^P-A_hnCRciw37`Lb+>bdx6>BC~^Pm8&KH%s!=@=2Rpa`04cztAsIyly53}V$F*FsP)D+Z zQA08C)7Y@}qOFa)R7crxnS;pk>jpguMlvQE?fQA|NdE`96sXi**T>jR?fj^E8^n@Pf`s1IW@PFV?3EvvdzM)BbJRU&m63Raw-qJqd!ok-@gB(bGsQ` z`(&K6LALSOgyk^kF|v$Nx?4z{06gjpmZc$bvX_rKP`fnUb}*@M1x)fDV@dSte&TF^ z)&_KGStIkMbP1smdl@y0(_)jP1l&tz9kzQgSg1r_k*W2(dOo?OuG;4U6qHx9u)|HD zJ@o!c1`c<-JDW7Yzk$953!sPN?{K7IAOFCSmc==MHZd?dn=>0}E}r0lT&drYagl7s zvCUHu(vI-wiDq z>N`<$zMfpH+*>F*0LMbNqB|+Vc(Ud42}PNU@(DzAof==myB{o_5caX?>txjRd8fr#ht==)3tp@xQrq+2--i z>r9HTtJ06PG=_tB@qQC-_Oplj{k-Z@5^?focuP&97 z4@zgC8qk@3bAo>fHGQhGA4tBbQ{=cJ?!ySVR$t*i$+n+X@JQ2Ay26e#%aICzb0}g` zDvm$9s$BpDV_(yeR|DosKOi76a7du2pYRwAopO;%TrsEIn-jK0pO_x_5&G$0nOoQ<(;7=+Jsex*8-~rc z7&9;CjC_b%UPK8yM~LebYYVp_`6bEv8}_Hzl;hupFUq!Z4ra9w`Iin;W1zG5#5(;Z z{iv3CKvrEVZLyJb-z@NNBq5<#>H-1sWj|&oL5ZGMF0)x79)9{Nh#@^Q{b5;doQUsLn?Tqk#ggsEsW_frjf8#uVJ*DmbRNMV9 z*1cqu{##2hfyC(TJ;GrdqFXf)^d00DBYHz2=&&4W35+VjyBxxI#wFu^_8@WS2%FYS z6QjEQQkAA@7B9W5+8itt8>>etf2aumzBJq8v*HRl~|hCgK;2vv9fOv!;$NH_+xFKYu>d`bYA%Fp{*&y6U-%Ze$d zD2P?{^V6`Jm5dg9be&*#v3`@7KCm3Y$n4Ho|MP7CLH*OT-eb$ZgO9?Orftge;Z0MN zJVR|MGf>rSF#9T+TgP6>dy7d;``Due14{sewEIu#<9&(G0izTW*c7u*`OGZlY@goO zAj6{yQQwzt8~Y_T!L-5W>^J9?ak%qbI3&}I&%!Om zd$x;R-sjik-w#Qtj$X>v&RXoGfwbK9xMvjS9VRa@XYbB_??IHdlEE*jPmWQEn|&28 zAQ60aR>cCv9@vPg_vp1PM;Y|<#f38o@3#{Chd_aUP)MIyCq`_Iets`ot@cA{of2eP z|9pEldo$nTdn1FMUfSg&Aci1Afi)vazrt2K&PD`5U{`oX(s5hxa{4~CUG-Q05|_kM z%f&0~WryCNSnR#mnIu@RY4zhb1r7^sFz1^OaToHZhx81r>cRaTWkTC$o1Y3N@)}}p zTHiWIASUB12YBkv%hKjg@%&pCJeE@v7-KrJY4t_$FPCTHWXe~LQsGwDV8lLT!otX{ zmzyp9RC~pCaQAE8_17`*eT$eyc!i$l%^r@}-b7R4hXt`ERsAK@j)_W~o6B9*&kgR0 zhNg{fN8g+ls62q#DN>B;>1{0o9{9AT!0El$M3ceTSE}je)eZv z|E-eA?@8zT85oi9eOsedejfj#bE;1_7^TRJI@c#BV;}PUqUlBAN9{?h_^P)Lk*yBJ zGi*wPV-!G^l@lJmV#O-CcO=Nu>NM%ge7>cBw{dN^khBbP7>(I0nK_6QdUQ&MUj+2|nEfr=6|EU8G)LG_$Psyr`^s zr}lPXWpF#i0h&g>A=NY#_+h0Yj(n1~Zt0`S^Xr_{AW~YY8ylF)gH6ndKFZj|g}jT@ z>xoy)GzRpR6Y`x>ryqeMqOK3gRaL=Jw7(Ubw(mvnCdsz*T6`lDG}kZ@1N87RfBEUJ z1)<%`;`20%yJe8W3;6Cd$bRuQO#;RJf?T6%8EV)O+e$x@LY(~C=DRgw%L)w7>Isn) zNtv+l?;qd`iFVt)v@W6&ik#NYR?4>RY;cRFmNKX4ed;)o*KF!OXF(35F=Iv?LK1

d@+@_W6DhPT_dk6knE1=Urd%Id6FhZZ*s=^zjxnv@*iCFUR)@M=1>U%n`NGdEK$8ORGxB?A&g%oIyXsFily`V% zyb{amo73{iUGK~hzCCFk$t@x!cBpdr^p??}?Ood#Pq-Tuyw1BczoaE18f_eFHpq}y z5E@5CXqn~Ky#H~t7*>K+{QI{ocbDjlCgo~O4*Di4y3uI*IPf32Oy7VEKn z#4V6tvLpaTnmNu-?7%h+aE$`GQOyFjN>|&6M?M}@0Lki$&dDl& z-e@$E?7i;ESl5U{E&Z!Ma_MYSkS+Zt{8qT|B*9@__Lnyu{(l zedbSB%)iCO#Swc-1SlbSwCmXQ?Wh+uB)wtI_(8K)r1MPD(*WS3A$4-SFDQ1L zytBxAj^A$z=7HOMU+AKr3YqKn7RQeKU>pxD?rN79rNG_QV^EdW3W^-8yjzo}>q;cc zaPf}t`-18wYbzmey z>|-TgS^u_Gjn!uY6z`h+3_4BQ=jPl_bY>O&$B1C7P%A}GF(fl+-B*=*PH`IQHM(9d z2~_D8!XEWCNRc1m%Gjct#QjS;(arHSm)QgY-o_ua66v2Acd`OjrbabhBn@BP3>Sr+ zEQ?Sng?=`xYTxIWPu5xW7+deK!S8>(bBwnb^9j26m397Q6PX1Tgu}JqHS@VNxwmsy zQsRQlu4p+Y)dtV*uorg>I8(W%4Q4ibsY636o`_USQ0a{d(&a6OQi%m~1|HZ!N<>6U zMflm{tW-mU`QE1ENggZsUiOo`h6<0A6XLaIoC0o9kL;3XG!TVTYvMJ1)}AMdb|TWdh?mJ-s>XmZl5$sI6+|P$|=J$ zbLpLMY_$3hdR|jgAE^i_h$R58AsB#dl<)|a3FJs%l;}b;dS*Fo9U6C;g%tWZ1C``h zf+IlgsVfM}<0Gr>x}^1Y9)0lA#TJARwEh~=Ai-4H@Im%oHrJD~qL1+PN;=4O znvFhMl^#~-c?-2iE-~G~Ym0_*y@^w*lPm5R>BXNIzTr|#>deidz2|Ki{vw@?2JGKP z(mvuV*}O}0oEG7nkkESNBN~EQClJQyB`+gByD^Wx=UA6T<7>?DANtN4$|t+(F{hhbu%7`Zlu7# z`&Gh+mQoJeX*bb{`KqO3v{oyG$8WxML)UH6MZft75AAuTe~nce(VYVYzhjLkzp1r$ zzyb)Iku_`D6|@wLxvj?JQf~9LPTy!~X15A!6RXX7a$jtD={POQV1Sr-ZI+SDsP&6< z-J@0EdcDmGOFSI4^xfd%BV%+fRacopnntV8!G1Y(?ra&lnr9l}zL7QDDzoyZFPZdD zUJz8&3;bXyPPBf|X$fnP+IFJfN3Hu2wM(a?)ebT{6;c)bwBUJC#YE?dPQ(9WXoSA< zscuO;_Nx}7L&1UdPWedo>{Z>O?~3;7e7SM$<`KD3*q_}$X}LPrOPwnuumxN9MA z9a3{7!2(gp1YZ)IA(728p!=%WF7{#6@Eex5!LspiBKt?5R}xX|%kWCW?A+Kb zXow9RPCCV?yE{D$ugs2!{C3bWbXJpam*Zr9$iqSe$ZK>KO|OegUw-(~UOzF9u0IlY zMusUr9k<{Uv&1K1TJ((**_&K~c{^T5`_s9zi)bucC;FGzd?i)uYncX9#1~5xhifXT zitX`@%cAtRioCRG2lgl+D>cXIu&ROXK|?hfZC&3eQ#WJl0Dhg7ruNLt{vtCJ``I5A zNIhh0sjQ#GxMcCZn+BHrGKS2&UIh(|X4VQyGgLQg11+{a>U@!W!9kAFpM2WYr_(X} z6LkV!Rd+S)t}B|pO}@v!=o#<+B+sd{le*ETmHwO%y%6%V-qxvApFax|HOw8OwCMt4 z^0xT`ZQ6bN?S~5jWIrhlaGQzx)UpZtEAHcaTM8xro4tJZU)y1xNvprs&#wcfot<^r7Q zyqbk<)vM=5N6EPxrMrU+vk`-!7#u*vjM;UMR4IKBE$1Z}a3N z&16H-XvU=RRA;fo3a*bWELoIY!j6e@rW?FUAM)AuAi)?f$iBq+$pSHnTYv8naD29Ho^{%{gRCt4pd=EVs*t=33z=#?&OoIF9xyqOF<>vcf86 zAP(xb96TpSDJ92bf=)8casm{Xf^BiAjfoD*5_;**E9Zt<%oz?>VPrbR?6LxKrBbYg zsadyTvlrm^e9|$-cMQ}7IvIg3YwMx>%DeYGQQi?t^@O&+GXuy6jh&@PS5JCRgP2(l z)=p55MDboWtn_#ZPrk{;e*Sr)dSrLN^oB_V6LzzVfRaP#_A-^cDoBjYcwF?pC&Wk~Q>|nk`h`raAi=Rjx9mDqsFQL_2>&MYEb}UlrTM1zu0?Z~F%nZ8#Z9YR!0|?CV z!-p;8HFk0aiud}IB%81I^sCSg?*^w_>Z_`~dtqdCHc_?I5+~sjIjkNUBVEB>>>k}# z#uhg9;i2)CgPem#zi2C0+MkfGMo;!{nzx21i@zwc82=@MY|ag4{R4se<>>~tCA znHk5YpiQeIsnftl%fR^a!amePF^wokC5&6be(EAn8T`25BCL+WUVWYFXZD*?UpbpfyRN_#xIHogn>iEqb z2jSB*gyy0c(W^Y(IVUSTr@E$`nNR*-~9ASu|aK3mdq@ zCW*RfG-Xu)PS*)VeJ(-gXiT4@zRsl7gpM;Afh06R{)Qm z7n~(JX&1puRRPRpB&p{*4hBA%GX}b_rF6Qfmkd=QkJ8a|{twH;{6NG0al*%u1gZ^Z zaPN_JZ{Wd`x)2FV+n0n<=htaAmNY%EYzdN2EYsI3a~jHn#3om0tP4}`9Sv0e8w+6K zXjN=lAS8mS!}TN$Kc^g2jV{byAo6yo6w$C$S$#DdDTADj{WJwK0gUqwB%f|8Vsg;4 zm}zTk8}F!C)~qLi)$x+%p5?;zI~$o$Bzkl&yc-tS!iWIcz1pFF*n zf*lYL6Yg>uWQ->jZT-P=Y#3&HCk|Scg}1tk?o=ed7b6#ZZP9s8<8fKJCpF%0jfbUC zLrD)V?L&B3JFjL5`_`v5=t5@po3P$78pEuV7_DZ``t2``J4#J7j*@eU=o$?Am&BWg zTL`OVkhy<5H1BP{c#dI08yu_ikR-@%T}^9RbndM*3$av~GcEo_r^gegg^v@f>h5M` zN59X`U6B<)xI}$cRpHET=`?&Hq%Hh9j%T$3kY}zv4ODsR@@_3rCR%-~k?BaL{?l(_ zrp8f6zWnT=>hw4&;E|&i(L6ky{fntW#J&d+5^`~`X|5Q_izqe*keIwd4`-zkOqTTqPsFCLBa2%=TIiEXUf~LA0f!d^=->lwa)#MEy&Wvs%r;aZ%fx_&ouU;Sj{ESU} zFWWF%72+i**YE}cpb0fe#~k%VR~J!`e4+yhlz@t>d*BsO9ZpEM+u!M1W=JB7?fKwD2)JhJ{!ht@p8@v16^V+VLaXm zjKz$2XRL~5tJdZwCj8-t0%2=ay7rEW@qm^O=! 
zPqoHH&_xX}ID~B4%oxf^ZftG|FSa_K0+^Z_DMtdKbDwz-Sd7Mpk>N)L_X)4pdGf(a zg%baLvd>GS2=^a7$*VNHkK2DrCLA%)|MV4JB5Y2}q+Jzla1T3cJLC2I>^*q>-e+4< zb{YGH*v&AS_jo3Q<&!q(*~$2l*UVDz$9JRn0D7BiMG=WjtcHAP5_elJ>xAy;$H`2j zG|VO~#jQTSM49$_9(46MB#AeZTo~vL?fZX>)~_$#_LZJeXQ|)9S5$q;0_Cuh7*an< z#cC-84rz^Xu)OBvqnOmAPFL^GE$xBaIOJHfow+XlwcH3&`iUCE)Qd|8~VTE1Z%N5el%AJ%rK6fe=u`p!Adn6t`WB=;ejFT{Xxku-z(N=G16(*qcPxX$G zgle_|SulK~PTH5Jt@Qw9dbm!#aik-y>uAtnC2VM2=X7ce=yXw3nZ;juUqVG;~ z#}qg;v{m*Qc&x-lyx+CvUgC6nFnYMr=FnRc+YO&5s*h?j_ZLLIc>+#wj5a^8tK%Cn z3AEvC6Ac_-sJZ)M5Yqx6?lu3I7xLLWLMxhPYcb3KM+D&EH20uTP~0vwy{Sl~?a@$` zlEpwj=gq;eYrrP2y_L_#-~waWmd6HB(<_ZH;Mno+s98uagv%hqW}?8dj9)Y(X2pAeeVM>_6iu0fqvTHaeENNa94yeOL3Noos` zC@JVCay;g91|I}boJ#$RZVoFvDIzI}&!vtRQMo?oD0P~(L*Pp*J#p7r>kaYlSnzJp zNutI(rj?l$M>EPg1HjKscmr56Ev+4l(paI-;22)pFLpkrTqygLB;{X``US{!d%Ydom3A$C!D8& zcM0!#y5mat+cuF@QQSfOL|mJlZFD%Ryep^V;_vUh)*AqxgzCQ10-!usua9)g;U3)n zmnmDs3$Gks@8yZARAmUr*j^WTM^YZ|uwDS1+bgJg`RwU)(@MtETl%tv>b6tXeHCW0 z=Q#qBcsQc4q~3~C!k8q{x--k-UU?-#Uzkg{E`|gqd)e&R)#o^xiu$Pg?zp>Mo6@HH&%5-h;XhkGVc0jk0kz1B zws+W?IO?+;|3)`|iCeP289RK%&%WyMtq!?rr@*85S9W2d3goQ8gJh<1MrG3JQo8%Y z4v?=C2u8^{L-AVc^sSc+0e|nb?p+q4aY015(0M(^g-MH}wu;L)GdDhsNzzBJKyEtR*Ku{IN?j2KHmy!gW|c*06K~N>(Ha4)s_;BbEu(Co zYd_!7_BYIdRzUO3OUVhYc+Ae&Ht6$7d>kFhc6@&A4o6a?j3TBsAplQOrdg<0Vs#y{ zQN?jcRQHyVUy18>I1RU0+bjzQwS@p-Gv{IwZ^M0T%uLe^q}!(7=Ch(TXy7ef>qvU? zWq1?wxGPVYL2h-$B&2ulc#kBaxsCAwWha42Xv$v@hHfZ>iu_Ur4I@Z89v+9l(=tB8 zL8*RQi2VfXjz2o^ctC-$wo|_Z-9&(mK9DGNzoz45=AJMOY@QVaQhf zNkI_e5IeSfYM19dMRY_W7QZ_#sI(9?^%)9VW1Df43?wtUcQW2bI9mC!?$)*@aAQ9F zr|kTkVzn=j?4GtJ9!I`)5R!-bJ!S758*8vq-0vbkf9rs#BD z^^>e~CiG=sz4MBHmhZ9*<70;fdv^P++!3x?3uWj^;^N~xA~MA0rro5K!=dq97qN2v z^2IMFMoQS)_9a~HFD)xgR`s+EO?G!JFavue(&uwr=5q+T(aNsXuC+QwMn(1Pk<0EI z9w3h}(I-u4?c?}cp-*3$zAvs-!g3AXKnb`30~a{U_@<2TVw#Kt(QaKbotSvNqj0;J zV(8Ce?sv4*y0?}wss5YHWaU2>`w7hkbb50;cL2@(nmFC)NYV!nc*sP9=zLeVFe_xY z*o9$EyeKaMdYOyKHNItUDUpM z@<4ySH+sG1EiC!#M)JLwm^&S^@cK)CBHYmkCx6m2eUheVw&=0MJd`?W|7LG<?!F)~WM2^|C)A@9WtW(@S%@ZaOzU>%ku0(6+B%4M5GKrUzzTvgpB=gk3%?6Mdx zh6M%|dAD9{_NaXE8UqV>g~Ar4I_ zf`FQ~T6u8;i+o3sw@P1GLMP5S_2ka0fSROy%1V?nNa0JjxU0Zmd`&pgOtB0y)J&^n zH^7VE%RD>zjA5Vr`^G6J#_vwQd4O4JvA+latj<1>8a#5p+U6p3bxBp<7dqPbKo}$`4YWSJHHv1-*OUa0SFzsVW$7)0u0E4Pz5I z?f~i;>PhTUaX`yEs_woj;ms%O*i4T<8}hQCrRlbJC!{gSq{g5iKF+}&*wyTjGf9r} z+qo6AQo+2gt|c1@I>PFR1@g*UtjP=qXGvXGU3fV4X)h%v`I<8-0#3JYcUU{kX0)&E z%7N%rek1;4XHjh^;21~l9MOq(*EX8Sx>heO5_DV*mWC4fTEC8{rNO8r!#v6jSww_Y za7Mb9+s!JAtk@=gmsg_6_6==e!kZ03n_2VSX#RY14sk=_md%W;ptYnR0IK&ZZy8q{ z3$%xu@sT6rIsz(d8$%@7rwz*uS!y+(M6T8I^ih78zbm35F?PxX_#XsAOOPDFY^&$2 zy$ZhAHGV`DT`7r%A!8lUiG+$f(Zv|$_$}i?*E2@rdZz_{?-QMm@oL+0QMANO5CC4rE$prEtJFalJDh-@4@b!mhN3X6sRk7{exw-$V|9s)J-XVX=H!d zu*1>fP~Uz7tv8G29l5jq!jeXA7y;udsSnb#i`QIK5**sRKRus-jt}s1jdL+02+LYe z)vu>u20ICQPFw2;Kg8jd`?6EdX{zrrs&U;CfjFtr2(7?kNqq;7`!9)m=%mIHkvy%2 zPwT}t^^(5Ju&&P~O7a;OX86e2ka(G%M_SF|3tZk!L%(Yak29XS`mZyF+IfM5BKqRv zJp2#0LajA777g)Q?#qI5U-3@M1Is)nl%>-2Od7VrSAOxYN-R1bl-L6cIsNBt@a1#8s{cy3?K>PBIOn!k3ToGVB|l`u&O|GP!Knb~3vYuilJFbWGG2jn?zZ zdN>BE9U4PgTCGne+!t(*2yQNreHyQBkC|gcSLR4FVkB<)>#p-s&Lbuc2`^5q_Ppy} z+TTSHb!kqM6aR zGJv-Q6E^g<-deyH1yyHKH~1pBZ)D-{*4DMWbGxy)7*I$4{Ha$|&9svt^4lDt8GV;V zSwhL#q31n^_mJBZph%oK&BIJHHO1*OR_7Ml4JyN}2tLJrG{Ko5*Z= zJ>DJG4Kc1E_Mz5$i<;9VefJ-^v^U==^D>q}SEHTKeVbAzv+l?PLI28A1^eGWtY&k& z#<*TA9urxSXPlIlK&F$SJ&N3G&EIC>4<6!DjvN~+KAmp%hAWm#n>Tf*L9CLV=t+84 zm{Qzji_Q^4OHUPfwpdeapTe&;V#KAe50?FQ(oS!22KaV{#lbJHAye+XTnwoqyDo+t z2035+e2;`?&X49gpfDge)$H89dI2Apbd*s57#YC8^hvDVJ^cw!*l+L20mZ)I^ZE#+43XPd!p+ye1^i|eW#||dm`(S<4YmBv; zL?c&XC4QNc9}jw_dlunhG46NMDfXKhT~<;^juXS8E+<|Wn%ZtwslvrZsg`~edn+|Z 
z9`m{NBqK3?G|t_j^nN<$Uftc}l2>g;O?Gwn`n-kXWV1D5;Ye7XWp9VMiWMjdRE>W4 zoSuRYJKY!T4_Gs92P!nW9Kc6aWlgI;BO*rNOTPKRcsUFx5a~<>YKh5B0jUogx4-@0 z`c(PDo9uzV=ZHAnP?hB9E*OcSJEFU| zOfCJCx(W=w-DC?`gsFf^l=e4?V>}LuYqYM@q^|5lmfS=Z)$Tu4Lm>yze(-86Ftnzw z>FS$eL>b|-M@)+S{D-4=Lu0B;^(OgghWDQZ7Co#|O0J+$w2P-)4>=O>i773f?#8OY zr&VXfnb1_15iU3W-jTGrhdJfj5I&L<>M_u}ubR#2V?)mtL=DI`2Q^g0vIb``>3!68 z(bK{A*XwmKe;C+)D_)EHoCnqdFFi|d#PRCmtv{%hYSOhz>uPCCUH%z4`8(9;ur{>m zwx274i%%4$A1WZvd)0>}0{6lF{&dAi{x0`0+rH_JV22GSe1P%5ntR zd|dToH)@J)b(WszT1Z62Ew!Pc|J9iMl3m@3;eMRsl0%HxxZUA~ll3dMP}kevQ7Hpb z+31n&S_WHe_S-)!-6Oc0)lOWXx<_I(4bf5}`+LbP^VDY~FuHY_79*>pn#GuA?)#`R z|4tBM#%ifAACJq+w*1~3dbJWgal+^~RY9~~`&je=2!mihM1uJ2Lp)%$QcLPBR#yUv z1R3DNHO!@F8mei7Pxt#`65r<)`-gcRj>$Jf2`^@@2rb9XUw1v-f>FpCM;)~sX*-WSi5Si=ed0GGc_B*E8A^Hl?%{U>H~aD#Uq_QbLOo#HDwKUwKrar zJ+9VQCL>GkJpQV_I$LZ~PM(GY2gR*2g}58=>GgMZgg4Xeg|}2GE53Px*hyE#OpsAT1zZC+*;Oi4JSjkz@xo+tjG=R_7j~x5Fd8(t88fAIjUP;XF%2M zjTc;B^c?}&L-0jvcSDB321Zhhgq~&T7KjimHX4s&J^v-Q{`$+cG>>IBsrTzW7PNBU z%7xOV`og@_e$eHWnLT}(NN6cFvfIP&jhHIMIC+QB^=7A+PLga!Ct!FH2=3Sf6N4t3 zcVwg){~(gex|?&2i(2#miEmy24H#t~*vvkhHDiniq?X@&EFyJI61jC!jiK5azPw>i z!3{1wN}w+FSh?L6A%bAVfbKu(UF6%$&X}g#4Gf8#dFOhC!#xL+1fZ(^_f<0kOX}V2 z6cml5xXT4Sb`2N81QvE#hTt#>M{%>6xm)xWBd57ssJs-7NS)b^8%ZF0;0ff^j~}{Y z)ZId=WtdxC{#3DOmWcP4@tp|8^(OdF9;Dnd5WGiWhQKfVG5ZvEMyJ4(sK1EDY&!mR z#L_B*GH!X0C9dB<9IZ)6Pw9p!*9mutWlT9<4dP^Myr`pXeLaj+ufpmSC-mw2VJb( zQQ^uZVO0_p1Lfre3m!T<-CE9Vr=CC-a-;?GHM4%9uo@lCv6Agu`|BO%;)o+~6NqH> zUI$cFZ=`7j52I~}2@>)Um#wl0lE^GGQ#{mJ_R_=zv$tIwQ$1?>?2k&FK=L_iVpUunt&7msy|A(nX#tT$(u&v0*>CJrj110D_e7$?=N6Qtq%@Q69riKa2QSFs>A-+u^S9PpkgG=OM4t`{MUQr2sJnyh0)-yOT)4{c zX%Nq|LrM4FzZ7sAtZGC>ooTHI(aA!!!3(|}W`i1EGjzDl5>K~{;nrKd*CP-B*t|GV z_WrL@s0t^Z{NNFj>uw2>Ss_2B)=d&;{MHTHqROAM77|M{uv3aGnOWilI56JK(;T3sUpTZAAtk1G?Ey8>5SeUKkE^WP~8C zaSL01cY#Nmet4%tAiZm%!WV57JqSc2;D1kZ=J%D$YJkyUpndPe%GM$Y_s}A^Y-8m3 zbK3&f)_Bo+7Se+D`{(5HnMjN>I_vPAAO`^{U3b)Z#OF9(nGM?0d;J&x=>1M z5T8*5&|H^ey4OwCu~BUuG_?$=T?9y)jF?+ZUZB4KHFe0)albP$b~+04F?*3AF0vg8 zsX2TOWc!7hibeDKDcE>GbfTwS6vyuyLvJdh(9<6c9z~Hi)h;_+oc7EhrdWKFimR|G z2Sh$O)nB5>B6vucJcD>h7O&}QA1p<$)mku zs_GO_M*!nAfeC`BS|dRo@5%G(BHAD(?rC9ThCs=YGAgdK0N>kr+GGym0aT1=$D>f; zY0)(-FH{TcuCI4co}r`4)3BA|kt$EopUT~}RK-brd^d-v`yUH*qNeC6NXy4^mN4B) zV*{J@sc=)>l>{PfikI8aExA|4nvCq}w`OZD{DOV(I-6oak<|P|UBP_OWIpS@kup&crAcHWw*acory^l3RaCVtYXE^3E*LNaQ`X7^ae` zrshX8XUe#i?#PMmS=zYE3zQqhmgTa@0mF`Nl_c8{bP17sT6=6Hbb-z1!y=L!RNR?g zjyx8_pjqL?K|6H_SZz=GSWkNgy)BRN$yL}&h0q7?Bs}u{JX5El`_FGEoBVqP6EV!C zo1&uv=E|8D{l)U**1#qbs}QMpG1MkP`V&c6|F5{cW`bhR6^R?)DM|i-qQH0DpTLU~ zFru5|We*KL-QvI(Sa(ic)|Z!qi8b2D8hOti z{+&nmn(A=@4=BQ;ptZD0t4ij6^hz70#1Pln-M-R7kH|hH+xnzN?IZ_Za^y$5H#~%< z_LTQ`um{n4?ckysO3S1c$CN>+rP)j*Y^B?EHUYClQcAz|3bfdaoAw2Fn^;xCspw7Z zk++nZ3m@dX5^)t56TySja=3&Y1w)#d&+xnXf>s8Si>S~9k8?!ydm~hKhotS4&E5Z>LW@m)aKHG-%S%dDz=FOqAA5D`#q_i(t+3{T63doxvG5VHd!gm%4_dITy0OEw{AfMUq2=a0 z%9EvHa`@?FM!K)6p0L&*j3}u~OKH1?mkLncxqQ6&&J-Xp%H@Z5QN4mf0lg4lO}9wU zU7XZVj&IC4@T=q5EU!VWZKLW95W9mm!AS4=e#a>tTBbU6C)@GwFHvu(w5kKA zIMhi%gYKW#fGBP*CzC`y3?Npcz;Fm@ho}vhl4D`wp*45{^Vn@9O1 zj_MS>_i_`%@m>u7Sa8@&`h{ANFMZdYK$&esvr$JKYG_H-3L2paiDn>c)$r>ChgiY8 zBf1b_M^a>$3)v!KMqBu(i@O4RWZumU+o{=w&uPD-ZgYIi|KYO|{Tv6KXxAJC^g(U3 z58c%30GRLvF3s}!ozwXTZ-Whyc+A_rTMn^S{95sMy`s#3Q7_qwHL9S!l>XZ_|7Lan z>r6eZ`*LpX6Nh?t^NI`~#V@<-(af2RV=YX3wH3dg@1$!(N&Pq6X6mnZQp$YGzkmC~ z2f*bjfJ{l^YQ5He7fT=36;EUAb?NB_xVR!$kXUz74!Y+Yy@e%!lh zZ?2!o@aLba^M+lMzt7P*nz9vB(K{eikR6sf%IJ;@i8I?dl3l3@CrD z(JnM|wb85)uHc|yE{ghGNsHM+9t+z!iv*{LyWLQp+%X>V2&?8NT`a?|TEnH;9#MD= z2|@ev@5f!WR+(HMeBmE%ZrWSQfoCqiu5*5fQ@Kt((G>oe`; 
z>K6b8WJVusps0IuO_tIG^Vt#S$<2xoj|te9L-glR_dNDk^vYjwj2psvEn?tblH#!J zwScL_@n3QlCY|^gIByFjE5^eSt2X}5^WU%H@9*RvZvq(=jq{BT_W?wD`?|%-fqLR- zara)y(5JkA?VJH+SvoZt+5%Bo`;mme+y}yzEA1L$BzZqrzi#O1D{w$`jQ2ZYe`PYx?{Rw!lV%3=z z0ArzN+y0Gq|LePvj(PMKajX9w;{L;ScZpD*vhR)t0rM*%(Kfh+mV*#|6#eM&0F@Jx;zsoe|Sbu2-nSnox<>ns$!w&u?(Es$C zKsv_1h-dvD;{78<|3ke0Hdp_{nbK(gFG-#sEaSly27#xim|f0^KMVc$g(6H^gcuNd z%tYQuoV@gpzX*o2Dp~(s0cIL4L1XnaIp1xu_Z0;t{=0(sM?Xuu{84Q*A_o7qwg8yV zKv`Ok&eiec<3KP0FO7xrzpD=$Q~%P30VYbv%V1c;bIS~ftV_Yatrr2OLhvuuu@~)i z(#2%;M_-|+|NHuC;E%16#RU-6{-dw{ZiN10Z2p<=^dDpMk2dD-_x~|A{~t0o|37x~ z;<1_^?h*Du*leywcm3L&UD{9nlH$NXmg7FQ4AEO|^C>CEN~G5oDje-O+34p#v`t@o(Tk=cNoa ze_oIH-*1o^*nU|@03gZ-1hXC)r>T;zbb%VG;!A0M+s901b&H<9QRV5J8^hpmXdlsX zMoWvNNS1av{<$<1Xk>RA0l}Mra_pIK=M1}A-6fQ>krfdC#S^AcOBBd*E97cD!X7xB z=O}=-ER@;F!b%hLK&PLJNKP|_I# z$NL8R`|E8RQGA0@12kvnn>#{@)mq@Rd_6Gc`TwEnEu*6T;;wJHVN|+N6p-#t5l}(E zpu1JNbLbF>5$OgILFtf=p}VDFh90Db9+)BSAOGvR@AW+Icmc~LoH^$kdw=%Mi~8eN zJQV3stb@L+tY6(X<>e0){vxrLDLq0ZZvW|`#=AhYkQ(&wk|q&NM)6|EdFS?oQ}T4+ zPVddGnNz*l-r6~OGbbz~SF_Q#yse~cJLxUMMZgsR$Myy+f3&DIb%DkK&fDt$mg$d3 z1vc6)dIahMPC)Wk<6~#CTfHq8-&k3}Z-2C|;G8r&2cBdmPeocxT0s;JR3R+uurpO9 zkx{|0|2Fr4j^w=*`E!S;(I)1xu%j(pko<-zYAL@Y*Ap230J4yOKhwTTQ_6y1X0b+Q zzMIsj94+_wc{}fuVUhYw+nryoO>d!TXN`zlZwSWY26fcul#fsA;yxmlnt4HLR_xKL zf?f#&Elm~VxUvRbgk)O9OD&O2X(2hFk3ZmI`}zW7pO`BRvFRiyr(956AtTnD%^Uxj zJS>5!$)SpEm@gtb;}P@8QjOF1XpY~Pj)H9-8A=`8CAWX&01UywBK#)Be{u956Yuf)2by3&4o zT>)q}3YPA43zKBoCNls0)^k-2is>i z8ogbgRp@N0Inln)1KCC2aJh z8t@TymC8esZkRRfSd?4jPKXLZ+blS z%zPWW=v4FTi8nS*7oFO>99!nM@blhj%68o8L6iw*v_~-)!C5@0BiGyfKnQRY`?K8- ze8nIW4$a1iFJjC9_iy(7+~UoGg#G|NX7SNUoR8#~PlUk)m$xNg@qB|EZ(u1W(z zq|C?86w>`=O@mtK?2=>L*qFeeRh5IvE(jNyZ>RnF{H~r;-+sNjS+R&Kc&ruGcpU^b z_lT{#Y1(bLMlH|2mfEDUI2MC1=GQp&oj?WG%$-~O&h7lr85(E-M(2inneACZDplrH zshjv{@BT8*3$Ois^U?c9TN(D6-o3xNmKC!1pg@PA|;VCUoKp^GZE_(Na13tH_6#%@pZqcHM zA;f<0=}#9e`STZxC&vJSzByUXM2@?AZiMZHF?f?YLffkS9~aJ}72^t&Gf;uw((D0? 
zjLjeY%ic>f_X&!x-G7-q&ub>S`Zoi>UFxoUm!q04LQj_8s`JywMJ00xzG&EL+Wj;N z_eN#+aA#z&o~htH&nv^zl)a?#9>2+if4SATSi-^4Spxda74~TuENf zbbM53*I&O3Cww-0e@lq72(%epm!E~Ij3rp}FJB^D3tUdxQh@ml@ip|LhSU};YhEuO z5Ah*>W}nOS49*{SvjsFDZwQwbcFW9`d<&`dTo74e0r%V{eCvm&z5h^W>zVEk-{(kL zRgn~r9zsd|3twMqvac9X&_OdX9Ch<|exI;{$a6P-yDo1=dhI&bzTBHUme9yC^SosJ zW@P$D-EMj3Bw#O&*b8Pe8)(&YtR#=YLXz5n6hwY0+_v9h+B>n%Lu2?U1olzPC*mb# z)vwy&cX0vDj^1+<*OzsRnCx}daorrh!GSkgyN7=(hsv8(XldLS(qberr6S9K)+;=S zUjZI4#OD-KR`;~MNItKe$#vU#VNHzP6Lbx;z-w8&Z8OE32-v{ui*kj~2$OFfc%X`Y zWS3b{cy4K(Ux)oFE3aW4zRtp1#OZSomYc9Ezvgvk$J z1@9fn3uK+f`E8}kw|^>-jl!&ZxPD~A80?Mf6pdi)_CV%hCZN2W( zmqVw|ouYsfbA&Cfcd)*GqkPFy4|i3Yr(pqjBg`nSz}45RI|^HzRK556?@q+Bi8_Vc zTK_S)M{*0z=(Kl|*bNTEX@iU@wfUm|y&>;7gxL?(x?cY(%AJT3N|FG~Z?GkFn+#%U z(|5blYBf#*#0LfYz;2;#-9j z1m5+Z|G+iH#I>-^=r+Iz-i&NpMAhq=?VrF12HjuWuE3cXOicsqTB!$5_T0yIPZr;v zwjOqFx7k-Tw~aBnO#cY0CdC~ZYoP62hC%bV?D|jJ8NEKeX2_rkiqqtO&IT_y8cdDG0_jWq~rx9IV_x)t3VxH-a@VeRj#ha#A+#*dW|A$NH z5dfLlPt|LQad>}EM7R5gLXkVd#QRL0@bb)PEa~VsC@+>xS`nIyTvlb|Rm}3Bhh-$OLqkxSgo_fa3wqfj2M=s+4|(nr z*XB(alCyZcQfw8t+fw*umD93Z3gOaseNIgt6#acmS*A`FFhU=T<#17C63Z8u0H4qHJs_R(=@}b_Ob*?}cL8gD|o;xeB9Va2%Du zO2Q3%M6OXl&WBFo<&8ZJ}52XI+x_^nt z+NUu&CG0^A$SN%A70vU3R@@NpkWGY}!N-bFaIVksz@v6}wqsqOlUL>KIG`q$K<(I9 zH2y2fEMJlCz@wy^t;4kI-)O^ge2$+YWLiVv^@ z&o)Y2dG5oH&0ERE-Mr!c&aPt$#%@(c0adKjxtHy!0oi5_1BEfUUn|sD8gjLEYlvzi z6!)&XoAFZ7tO1Alne*);aP&<5xAMQ88c@E4-L|ru?Sk7zDMVV=YoQ*AF0KLO2V4N+ zMvVd_!D#sppCmGoF{MGo0)vfoWb1Y4Wm1^aY1%|vSx$qCl+@*hnI^})gG|z^anSjf zItE$0GyeoFkr7c9gS>zg6N|#NYA^~mv|l|H|Ld_Kx-~(#BTkzid;|YscJ)2eXDzw_ zr71+h&d!1ktep@6@L<~|r6!05ud&zSi=CI)#;!Z*ML8~%=r%lTqrM+#iIR%##Gg5S zJI~rKsv)@~tl-mx6I@w|e53#$i!~wA;OBN%rPeZVhn-~%Jru~(gq-`YgZJ&tJl!u) zk8~>R`5lf((X+Exn9pr^*KmiLkqlnSCGHuTZuAaKP2o^;IM&Wtt;ndcEIF*IJsz!o zDre=Lxm#A_H6m`QU}!n;eMIe1ACqVJqIE&sk|}0ycbL4ePImj-;P<}%2P;BP&;*P_ zJ^Y<%7wv7V&sKyU_kn@V?#mIgbt=#vFgi7TEE*cL)WKMvK81zl_@7PB2C%em3$}Ip z(B`M}f%Xhod{#Q#7L&~OS`rsO_A!4En&34q%;3Gg+ApMo3dhC4xmJ}dJ_Y0a5`*|_$$HYrXjCndJBlz zM+FvKPsSv2F*|zgh6b|iPC0o6@rDqy>E!WjqS5 zC&O~yPYZO0oi;x-wirEf8M%Uwc|3U9O0d>)e9RL#eCl^Js)fl$HXtwB-+y*hx~5yV zQIE~GP50%fK&1TwuyAM0s9@g9j9Gf{)e)!D0y1FnHPX}dXFJJBCJ#!GzfPu8MsnbB zo7|CATZM+aFP)8In|DTlgLLJ|r~;fBz<2hDnE6fMJ9XgG^2(p24X*v%Nyb+HH?-Hn z%=k3y_s6;t(C?zI8oLjus{70C9-mz(t$2v@Vb$e(m{66 zs;d#MV$|&l81=XRud$PYiKC7;`q%oF-9`El{o;Lx+JCkYPIe3xtWCi3IQ7k_Ji^!; zszgW&clR~ex3h0jV+EIf_C{vcr(TN$Hu!CbIYniAKOJ>!KGQu|!3{DAptudY+G^#a zQV_UYf3oz1)>NA^qHjV=9{oAfZSd)*(Z$rVy5-Sjm-W{@vjF3;C2hCTUD``1#PnDj z3&mnv7`EO8k)1@Ck2n7fP?;Rej86?@~U8@iFjfUT9oUsJD+#$80)JjnwajG3I(jW;(Ssnz8H$ zdeveg$jp}sNX23Ze-gQbShe>6v@Gj<;F=xYyl;$zJ^?ik)%#P(?4Nd?gt980br^y+ ztq?BH8C-N^?uDEn2n)apM*i6u{+Xo4RyKQsCPhn=48`QM50QQEvJGZO5^WdOWoYn{{ zG`N<+=~;(B05W{EeI@k6?~yuX3(K+g!)LX8YBIh+?{?-xo!0@ zzvkE>*1E;j6BMo|-WdJN{f!Cqx3qw83_w+Cr3@HTRq$6aU7Z=viGuj3O(^t+$`M*e z8=EmV>$abp3&Cp6$~OPswF z8euJdb22MOY2z8UU}GqYJTFT;9b>O6|pU$0fCz^&IWa$S4@;- zab^|PC3$)1YhBXyjZwsLyvJ9Lp;Udb{tm|c^Sk+0U=!U2qi=kiuM4;zU;6P3!koz_ zpZSdWqlJSVntuHYd#_i&7VpO3^4hxL>1`H={JKP~*stK~-cLjCi@n?4ZCdg7eI;(Q zDy+iif!Lneq-Yk{VClh9B-)7vzx|RDUnGq1Z2nDAv9)zWx-^K`==DQnPRJp4{L9OT za-~r6QDeW5VOqgw^RRy5^!#^`=MFB@tQDH3p*It-JMKUM2g@ms7sdMT z_vWo0#V1?tW&hnBOGPtN(FLr3>_GJyk+F`5aWhfM%4wVp@hdsh)ycr)^LgjnV^a(H z@m>RYZMC-{eT3s<;n>%~g_N1~H5XZ_Y2Jh}MONPTx9LIn`pbW|q<3j}QY=`jSS+oHEXqK~%Q^wl?0I{$pv}THS&?9|l{ksz_kNCw;RaW)CYznxNBCNoNL1wb z`M+FwLjy5Q%=Pec$*Iz)(0N2gQG*089p^dE2ggl+ADL3iY{2Z3#hMgeG>CjkIwbsz zl9e~;@ci(o@75inaAf_OTT2%0vRY;d*rm>4%sHMS=;Y$cmDq@hz|9UwszIIPAH+ek z)RnNrQ_cF^x$EKU@t(lsL9^X%0`zE{1lwO>BGBA(cHg@jF1LgW(@8ON 
zTEg_9PH}DP2eyn3wXWsIB#<(cq@vdWoMSC5$2CXUf-B?Jh#4Xb?O488>W3b9ul0>r~Fe;C1 zAA=nn7WB%w16puVBM#H(ovoaprB4}r;){QoUFY&smNV9$q?&GBu{iZO|7RJpJn9)d z33S;Lhk2Z2+|!ptja@Udm)jln;K&nY611u*`OjKL$KBZP*OQoOlE4+i6PX~TEO)gmoztN!e;%Ok z^i!_=p<(+mSMEyVZf3hMyY|6RskNk;^Vh%{bHNJ9|#NWJcI^K48b+RO^E})~F17FRc%mphoWn$#vj1{!)?@g{Z z$gx`D&7boQtkv738Z4N&z#2DweW(lpT4c>w!`=9JDl&U=EHmNEd1SCDZji&~=QUyJ zd(;>6J2%_W*tLAF>w;j^m3#4TXQ&u&uA(W0r>vbXn4-5;$CEh=Vk~hDULI=}AF5dV{uuU(mdV#L`v{)fd~ajdWF69M-15f%tb#E2pAmit_mcS4 zC)hTQO%&Tc=P2pd=RXe@*WhD_<|r2+K|!1QkWAt9}+Q5o{Z zM!XF2bgbGS!E~p+3v?Z~=ytLo+2|If7bzFinA*IQ`jTG}f}2f6=zMRzZay$-jdL4T z`6yMn^(32@b*z2-(Y#ljUS-{H0sA_#7Nrm)kE$mm8+5mpdjt8&@}GCT*rjFLZk3|0 zddXQ`y^bh#4Xpl%MFIEW^#9z4D|Y25`4ji;lNCzET=}8Bbu?MO!kxNL_}yM1xu5a2 z`!0S8A+lwECQvxjWXzQ48KfytT?&}5^-;fp$f3FFCn6c=Y_8+&OaEy8TTRfLtOsUS z@rYINV`qVQe(Ur=(D6J7o^J8;m}=|~v}=&beat8;tQ@!cH@?re=Zxpc6#+f|-C3~B zmV^bdvv@{_mgr_6$1~Tf4rI>+0s~6npP>4xIbWwu#NI`RN0k0OFLiD(d6@MSNP*_oeaBLB%?X zun1N{{jPKyvB^jkh4PRm(z;eI=(s(#Zq#RR?-zW1xpk@GNm5 z^A4!4PcUcsSt6%R#vL>zoueYuVOlw4lfHz#DioXN`P&kO%J0-;#5_KujFT_Q4n7N& z95bdqnT!%+-5DAQLce3xRbMet69ialia$F^1!DQ7A~E#Y{x`@W_Xf}bX^eu4B3O-C zLRIo+^$XA2`lo?v*(xpLh9drNA`<5ahmKcGY-}nH2hWB1TLQQ~D>f5{BRO>=t#a*4 zg%$lr{S#J~y8}&`#@h_Qz4v*(k4LSvcF7E;)Er87b@1$?y_PmMIQGdnR z#7q<$XdAxgeK6G!q$YbBR9tbvT%s&V-a={t2FdxbwKuSU$&lbrOfKPlxczt#`EwuE z7W|{ezpq4XC;RPe@zlLaWqX=q{lkiu8Ii5IS-)ROJCMEwdyc4}AK`cs9p1$I=SXJL|a<%m*C#Q3?@aGdB{L2mb#mVBGe5{Y`5IWk$Kii3A$Ss`@Tlag`EUq9o~iI)&8=fc>cm@RBv!{`2d18#Zbk!^S5oqAOmo z9ASr(wj;U8%Wk&v4uBXPuq+V384eI`k10*N(3;}(3 z`eldn;?m z6{b>`pnYX0KY>BJ%OQlUZ+&=zhbh2#MXY{|BgrzS?|Nw*?_2xFGOzeP4v?<}F+Cfz z2~_8oUmmM5@X1;1&`gAMFalwaop;A#+Ja5Q&Uj?fOm?_1g4(_aK@n7ujy7kjpY#bxvUVTVz{hgDBVHK zKj-_s4}J*IZ2&mpqG{~6Txpo=49Uwi5mHqbNE#`wjW#qf`5u*92q9QJBqgT8{n0oV zfLvWwOGXe zeJ0p>F-JZ}n|U=9X2&&aP-Q+)RsEn`L2B+#zfpO`mi?&#o*SN^g{>zu(&=Kfw|w(u zvdiARmYAg&wx4mrH>)&EBWkg0<_xt}KinzJn28MYvXKT}{%&jdyFR zAB<^F=RuEyU903|dWWilqiJGrReT#vj6YmJ1HQ7LoZ|+Y%KABYr<7=HBdX*$`;_Fb zchqF2o$sq@!lB2+8)?QEV&8tBpB~IM92r+%6Q@cvYeUqw-_lv(56Q&qr#;xA(NBlq zvoduKD98Sh`YXdaBy(zb#nQduM%VAsX7PiGAgQ8<_{@3r0_XBYgzk} zLWsXBJcq~It2uTcThP2JYLi*@G1DkrpzZ+2jOx}Xi29N)rlIue9a@sH;Ct-b5Kf^k ze94g%rO~`#0RD^4t8|hm=&}k+7$+jiazhyX*Xfj&RSJ{d^*W1U(q07ErfJa;LXE*U z-LQ{z^18${td16~&})j_iH5c)@Q1XYC~NJlqm12h9s7EeSejYmjpdh| zJHA3aUNPk=6vWwBl|RtP27O(XbmaXWi@&Ykg%4>c`g4>*`*xkZB0bhui8_8pw%G`3pMdtE(`cq4|BpyreP%mz^ZBPS~0c{qLLeUO!28`2UTnf zyWYfqA4)B;{fG3~Klk#`%yE;3IRHHyk&`6^HgHEGt+MUn?VN(ULw!G0aPb>03DA~O_IB&Y;La}lg%Jjt z(tKV>XDysw1Tfti``A%0b?jO7xwfw7TQbiAzu2Q-WLAW9%$o$wY63n)rodCzOojxAxmCr^nEBOASkR#dc>$QHYs7O{WjOs%sV2x&-6w3ixyp zKY0S6^1Yxua*d4>N+6;tpD9|M15megeFkVc_McSW_4gf)R?XbJd3x9Q9UK+&HZ~q(6zp*DbDqoj~82+u3q23O6V=G{<6w^6n0_(-Qtq>?e!Z2 zL2vo2$KgU>{|$fT)adNS(PHZ6_~%N#@iq~e5JVx{wxW~D)F)s67VeI5GeS!qhf>e1 zyJm*JWTQxPsB^Qzb&pR($4ZlGQ3#5zoDZ$d$~Ygr?bG}U>5EcQs&PJOI6<_rJqiou*eN5I9lvnxCzfOqIH!N* zs!nUu6+&45)C$|F(u!Pua{Z;(4+AmtXVEf}4Y>Ap#lroz1Q=Ey?d8RjJL<6EnfBNJ zXv3{`LLh>ES?|evE{pV~UFCMk^8gTDJE40FS99e)#a}53`Aoq#|BD-@{;-L?smHG< zSla;5rxGR`ZHebF=_Ls56@9r4@0i6us*B_~J}NA46Pte_J{1P37aE*->pDf@;_KcM zED_v)FJs*v_LAS+o`@&Wi=L|1kfRL1#(ab308yiQ=5MP%E#KkY&cw1bKn6w_NI+h> zfq4XY*dK2#eRng-m1&4>gicf6ltC_(oe|qYjF!-z6u_+p#L34f#$qK&Z@=3Ev&cV^ z?|9IxOMoxLjV_}ls{AH>8rbb9(XJ&MNJ`G6Y}?WqliUSt^-f*SDXcnWXd7-F>y@y1 zUOlA!frF69%B#95qc^BGm=DLG{e+oQv9+{CzdM^>KHg$TZFGf6&*x#SL-fs^nSW7- z>vHZ^K?-v*ah$|eCWKl`(8cH6P_6|Srw`8y{F6st8UT#@I%#R~=-utC8)PW|E)vur z0U@6cej*{y=(Y3FX2FMcRZ3V4JKMj_D|eUb(kFlp=;R5W0ua_yCFA-`dYSrW`S5?6 z9%GH@QAhj*ABwBtGnieQW-ehaRvmt&xbRlA;Bfw*J{|O<{|5`^tf&HD!3U}4rzxa+ 
zhODYmGz)-UWcGwjD?duJUsg~stlptNK>R>dj%@I=jo$fths{2Mtn(#7wiQvqRWmVj zb!>G;zr4B52kNB4?bGDV1aJdFoA&ifOSItj!Q6}CwRPC}eEdq65xarB$DZjOYw*z7 z*|BXycqo~gxs#m0;ffOk+{X2BG9nzW6LkmZhJ@qm@cZ z4(j+n`fyXyl(0(`RvGB{;lqyVbK8%C!W^}~T0)0?9$Do9;Q*0UPy&3y+pTf^q|Uhb zWHG!YF9M#L_cwc{0739`6ElZ{o^!kN;RFl9mxQJ!C zg^8Bpo&&q;hrr&YqV+a~ML>3ixSxu)*_r2z8BNd}#9zS_#X0S^$)fpJpW&;H(0yF? zwGgswepU!5EYx<04fxf?-%Spj0(5nvW*wM6siLwCo>-1}hi!x&_ALKZn^F>&ZCNtEjU*WpW5FBX2f> z^Ma~t_la!B;q%_&{F{90uJRIykcd&hXq;A1N~^E=Tz<}7%ZHDK+)yFz^x>3wGr<&J z2Rq%?l%%FcuN{)$E32TlIm=Tv2(&-kM!8#wHE*2p4q3QTdfzTp;Gd;B|8!KV>wFqk zu8p>K*>@wg$MtFT9{qe*{9!meLd=UQSOf}J z2bPv(!eDkK!1Ff#cFBp|<g<@FvPZf4 z#($qS=K^^aul}%WJz!9lu~f0+)@QD5(23s*%-}R}UngCHA53e2?F<@UvD_yFFdP!{ zUDgDslU2D&=+i%&nac&~>)Kx-jlha+qVGH8aEdDS=0dhm?zRnIrK|{M6IMW(=F*@M@k?ecgZ-tc&>eYl;g`h4rv; zIt#Y;uECO=!r6;bzY#jBLzw%AAC)AODRz~w!v4bwhjY)l7yow>gHgM^t@&TWduZ!4 z6{9{Y5d_w#z9GPGwbF(@GmupL&7G#l9IJmFb?@&emP+RIvfsf=gDN_T=L_kah11P= z8ZR7G|Aj}Jlnl8nPQUQkEvp~0FQKTEyk5e}SrT?pL+%>77Bp#Wv&2rvC?p4n~Cbkxckp5?;! z(HFWT@gzfj7(OgAZIYcBFU~h?t@Sj;lg`G{ePPH*?!7~_U9l%&Di`T4tK@za&AD1+ zy>cW5d3rcMY1bptLbrfCv`r)B1=Rt-UOB4LRu~$bMyGMzK`t|D5ko}5F)PNt>n43+ ztRrs0j=dp63+S3pZZhLQzqgG6Ui5KvPqI=-!iTH=#Bd*lXA;A|92qpkyT!f^y!E`; zQn>yl>#VBMZ!>*_zgG7aKCj$C%$c1v%z5mLWaJPBXqUMbLiD}D<4VGe_U;S6;EdeI zt^mv14K2*W$poyjf%iZLo%h>w(OTm#(u?4L@y#v6UGpVVPrZbAgw)Ax%pUtyyPzC= zCU&;-NXy9`o-;(eL1j<)kn$%E%q)hMpEts(e{_0r%XL~TP`9_o9}qlA?|R8XHV5L< z9!JGSMEf6by0TE7p6{h4R%1aD&t>AzI{BLqOnb-9)Fj|;>hz$0Kx||fcH-H@=dnn( zD%mPE4%m0>MY*{X=%PRWAsN zbEX~T=qOL;1xIKT1xcYkYbzSaAzqe!&2EcPtijXBb5tD|#0W+vCY~LK`8j(s_AU9} zQ|FNghV)dR8U3vk&G7OS$Xt|deVvN{QfAk_`6ql?08s)x9_I%Vj)uY;H)0EYwAz4t zo}4M1Nmb?=yI!H%?(+Y5!dz}{A!Yv_iWn4suaxN7zBmUU)vW7@hEJkGihGFR^W2Uh zuH$d;(i9t_*mdVRE1S~+!>4WWzYM2G=Ieg_Y((N6QQ{1k2{-cy0d$eExmAET&g(md zws~ur%++TybDpFTt~*avJ4iL2Nm4Bzz?*M~McTz``}Wr}eHfA2mZ zEezmJ>tt~yoXo>wehCm|ptDs<*t(6qI7{%wni@~GHGn09D0P4Fn#ipv+07MKUAo!3 zHJWCrCP7;uD20aS87$})jA`8J#~IKD!fbFI8wP$paV8T74(HR&rG=jIY3E1U$^VQ0 ztv>z&`VaqGN)`5v2-mCBx)NFfeYgLJqj9y4e3re982amGWYX!p|5cIw6|B1oS|YRn zBqX;jEUf*tmLc%5^$BhOo@8iE-CGV>{Q(1olRM}yPnJ*!Ha-}D{&GI=tV^7-Np;bU zEIoa;lyskT)?R<#eI<}PSuIiPZP;^TI~!x?Z;?n7qzF&$Y4PAs$==m!}WCm(b#G` zhwkJClXshv?>E)~O6^sSyUZw)8@l?xni0A$vST6w)}C&1+?*TrZqoVR!ETbywB455 zG-C82H0IM=qq%1Sy>7G{iQL$*wAU=bfqc|HC87VRGZ|*>erO7#t?&k z@)bT!5j7YtUzUrC--oD+^e*|@-=S=URI73^9e4y57AwROF2FUa#A6d0kby&c8A=Kl zgvxP`&a(m$7&HP;XGiY9y&bb|i0BVEBNzHa6!EIUEG`qy-D zfIVzxslnf=ID4@|H-qGMRNC`BGjgUh-y|dRbV~&5yl}6_uU;kden+Sy|5OlB8=?(6I&Su<|(o=q(R7-f1O|Wo9hF zJW+CklI9P#aK{FY-!+(je|4Jz)W<5~9VE^?>isnNOQArBFy30Fs}nmqlKYtWAWB`|Hb1X0R#`2hJP@e2-l%PR8vGJDU5g-BeRg;$W>XYeEoDBP}H zAg|FzMTf;55%nd&xF;`<8f?IGH;_`qu3aG1DO>ziC2}0>$RhaA?;=6%?8ICT*)r|@ zcSzN9>=!!T|K|cpRtb9egR*&WxMR0d)1|GkA*EnKY#l2)M$p{ofpL_74|-Uztxb;; z>c+4jrokVD?K!a^t8wFAtXB?Dd+Q1*a0h25Ra+xM^yu`xq8+I(0rNyE7=QAM@65;4 z(WYs{YJ``MS$+jEZtO)Vvg@Rq45aPEM94zWiIpC@xsWD=1Qu$aCRfXdI!`U(ujQ5G zEk3g{uq}u!X|nPAORPl*OZHt?nn2hG=V^)_u?UE6`Jk7;cNIb&?1$W_0zhxIQH~x2 zsh4oil6z$3J81{&;U}h2)+l#}$CSm6aS$s!lIA12DT z2Y7UBVCDwmOIp3`&U1ux+4(>*Bh=GaIK7u3bAJOHNB+FFMmr*;r|15xVW4X`lvGtV zWM-EDf7P1MN3k)i9Z2LT_&G(;mG7V(EZy;n@UKLvamd%O4$|{ zdLu@I&vgd+A%ZlA=vXC6f*;#}BCDpbVY2|BV6RL&q^}qPY%5Yv zQr}2ICc%yABbq3HTV=yjC;qG*D<-fs%~DydHo-#j^pJhfzaT5nf9a^_rc^u6KoRcf z$>OV+AvhOW(s0)XtBOCn4x3C;rjEEar>8QObF&^f=`zQ`wUK445jqDZo!_PrwnT?t z)c^9((dNN6Z!8B5+v>?k(jw(i+#z=jio@8ElLhZrVM}G!zIEN$b)lE!20MTNwv^sE zFQ~JdF$yS?lIq0gLj%|xV+Y2Itb#mqu4>x~--O|Cz1=rt%1?!E5HV_-KO}*ZT5)aQ zT5)o{5t17IsF<^2ejNzNQF*F~2V>>5LY`PN_IrGFH~6V|qnxeEjwaCL(a}pIo=s0H 
z4_g;Mk>V_>m!6mbvRx$aQYpt*Mxh`7Id22A6JV_hD4Yy{(EQafUyqi`#(+y0I;put80iDt^e*nI*l?mdUKZU zyGDn^uW(}(+eyzrQAs~1{Sb4(9I4d)%%9l2)GBtuODP4%yPxdC;yzRSsx`RWI6)m& zhc~N?jN5kxei{3CqZUKi`oUMdj@;IG{Z#tA#LHMiUV;l65&xqZK+S|E!tqC8*rV6$~;BCuMC;RNfDcvRg~z*hF_YhT>hoU9LqkKv|N`mhMWpYM2h2r zSv}r4XvazlflP<-T(#p@Liq%=dws;qaW95diLZ?BK7!WTK%x5T&jQ|K(dSHVI262f za5)_Dbvzg0Rl3YMpZ~vPOQtG22Cs#G zHnqTGghgcf&vy8ZW7(vHZTwdG#VUeSpHlLt6uRG{I*wn?QaoLu0pCdR-hb5{3% z5yYq3t}FmpyJZ^ZAafVaYc|4S@RWLplp|*!1-yD}W7@-WKWdjbW!+e(o$kg(c)n_8 zeYj(S@Qcwf2XO#kDJuyviGf22z%F#pdcK&9B|d0~L^0J$uHG+^wTd*Bbx|jOU$&Q5 z3c)wFLn#Bf34lSTEMakvCBV83cmilIVXvchb*INl$>I_cm6T$?1n!FAXyYBr=v3tP zz1o_SOq_yV&^!I@0`_}~OD=8+Q{8q;sr4~;Sx_hQfUTPY$rcBea*Ma zy99P=4C~-$qKsgvxiGOfxRTo1FWSL-d_4XMtj4g~EruAAUk7jW2sQBQ9gKhcI|XKQ zEG_NP567K#?j~-1xOsPW(>O_3eNTk~uQJ9;K{eGY?75&boBx(~Sn2kKpx=h|?+EaV zd2|sS5C4ArD+B*>Af*R)cT@qYCLbU112c=X-c5?@X8i5GOJm8c8XswKEiACV+ou_v#!I|%PZ)H)~T+NEaw%M~T zgFp_4p*~ZcxBt3hiN8|RPyMp*f(zA=<66gYUq75=I3TC60)iXPzO8~UWnwQZt%CKN z&}A}2r!BxfWAj)Y2xmU};z-WJ@R(woF4k=LEhR3Ch-#&eWTzVoB|{6t*Qx8<^8;Ls zPv6N`F4XJI^Sz&c`ON&hiY@f>+aJ*$&$XYveeZ{CeVNQZ=`RraezPWcj+!copZP~j z@JgygnsIe%>mrY0!!^R!BN0fF#$?(!bNY&`1N>=U<)!pKsXmX&hvujHyDcGa{QcGk zE|Y#HYdOn)?CZHyISBYPZ4lK^x(m{Z^QZ}IFhtZ3lJ!6b?OQ*!p8YdIfJ}>WPX;uz z>-!qmXJ-%D~&j8pkdgm1YFC9F&R?4xvY zb>ArSyv~hvYY-AX-8zce`pQPwP=H>OrRX-B*k| zQ`aH1H)p8Gx+8fl%at1Li9V02o;;A7Rfds3i;KbrS)R~DTv z5`;^yrR>KyT3tZOH6EWiyGF(}e!S*Balh&GnPpz9SGi{r`SqSY@+^#)wMmnC&^u%o zOT5z*iXPU_{iSW1tt{z}EuBlX-Wip8@8c?49(;2&$jncKvq`0h$dzJ${|Z&y&KY|4 z^px0E7=1Io+O;+HUYFi9$h~#Y9siF8rq_nd)I*oy zWuBVgX%FV}1^X)A*1_6iHI$JAfBWlGiIVe^GC#@Pvi4n%_W~79IA$52U4qwS)jVdU zvwW>;Sf-I4h>;!WeVk{P%==77+;iF;y1CCsaB1yH$iWZZ;OgL{7N4u>c$YAN7hvkwubxS-XA}?7z)%5dM>jpk3#}8ZwjF z-NVuL#nE+~FeMzw*}{2-xYE#j|JJ&~TUA`hl-DfA)pT|+>ukaktgpYjcNSQ?f7US# zMMgn|=QMqAqEA_4D=#$7`$n^EvyWl!5abR`CJFr;L(40yBSY>!_has62DtmpVtXSXM$W7tDPf`fQ&`*L9K} zgcPzH{jqCLO`sEIC_p*kRervydztn4Wl~?ZG}thfqJZEq!!hfj zvfs-L(T}k^|7M5L$iwdF#BUFk30@|0{4pze*?LbA3mb3kjpFSMJ#COm5agdT!8N1H z$%sVX5Phw)E#AEs57wRS?iv7J!xsJAe$R5cf2c-sp)liO~8*~)Q(+zfi+*i^gqCtVv7$DMma z07v!7H(sV>KJl+}m^iOGg+p#=_xy}KFl9}Tf57`(TTN-E#4E`P{8}@D!oEvaoNt~s zDOK9fPwdZDU%Twj1EOHmo53X5Y)4u9nULHlkHW_)%JHBmqrgiY3MN^$fy5`NOi~Bm zFvwUAA*>7m`&jm?$q zARB6H#QsLUc9e3^KESvp9)c=>pS3%~`HkwGM39s=V&w_{TBTa!SrU8DsJ6{nv)El7 z#ADB$_ipgk%UFA-bgQ=-EgL$t>md&(kYh|zZrx~P>CMgTuCw=2VARHFHqSQ7qUC+Y zvj9Ft#SpDb&U}2TizF%4>B&rrh#YRF0xKL5)ZUDaX@wD|pcHTY(W#S{af`_d6(|Tt z^|XRci>w?B_Y$Z$&k!cgO4EM4e7`vj$)MpZ2~ps;PBrSE@)+L8M4kL`0;6NDC;^ zAs`^VsDPAEq#HW$;-)h`Q-X!%97oiUvr(w0MhS#DbH zYU;WRTN#zh6F{-AO0(7xlHA>}^0=lx-T0yZMUG#ilmPg;N1-l1bh(q`mzLaPQ98Bt z=$6*;_gaKUtVM4+sc@UwDW*o(neG(J)K(OOPTX1O*49tm58LOCWc)%x2iv|Qij5rS zdN_yko&{lkGWuP>Ct>vlK02;Qo!nK9<_U;sS#dFr=!rudhXPck&_%YZWOxV0#nLlZ z|G4_9Tzo=m!)NGGTCnt^Rm?-G|Gmj)*7(ou{`db_*MKE0lN-5_($dY(USz8OH^-ej zfzMSUlDj07%lZ~(j|fb`QUL5%dRywff8N+X?(WY&r0Gbns!tEf zaR4HxkSYK3Yd?QYe3WO}K>=uo8p8+cn`is>=V$)oix;=f^qe8y8$aa%`S9a&r~31q z`ufEC>`);*B9JXz?)$$wJ;}R|z#W5ZE%8ZVV|-dQ}`@g8!4FSs#({&H02R=wwSgWOv)I`k+Pe zmrg*M_47a6Ui9#D-?J?U*Aj*4%lrr4hBY)4@W~riD{G_GgZcDlo`>~6Sa$pMKb%g} z-%jVh@&A8&eL!c;wn`l@08i{veDX-^-{Hf#ee?M{1fScDKe6?{L-0Ag^Jj|m{~!i> z!l3?oX@FhS>*R61s78hYk7D@kk3fF2+$7_7xqC)U(l%vnT$O=cb$MuSuiW_N{|SgU z(wiDjPq-TGaEh)+70-g1ByV+wbHqqM*CQsd6I^atpY&2NLpc#8xb}6kQx=c8q0X6b zkm;+=cnlA8`X_$>o&m2(!+lz0N(g1xh;Yrx#$0#IY*T}jS^M79>~|k{xG2Q6Gfc43 ztS3_TEZ0xp4x>M>eReRP09xKIwJ9&Q!;?`q`710!hmvt6W-%98I9V&v@d9XcYV_$i z!k}l{@L++Zl-JR%P$Ha54NaFMbEr3EJI3mFg(-9}8>~38>P8Q*fQ3uS7g`?5Ag@`$ z%NkvGI$l)2`RO|=v>djO3=6-h`ARYJZkQk>a>YF#Jzf^wqDZB@HOrabn~E^-!s%5o 
z9D^kr=TV+<`Web8Hy8PvU0W>lXn7K2Bx_x}keKn=o+V93)MRyUO(rnwfnytcnti^w!MoOf)g@}Dm^Lx!u0?a;Aaxq*lGl-nysoI{f%IIMF<-u+c~ zw4`6%kE%FokA}zvPt_xwpM7Tk%CPSm7HDBL(#hFSqCglq&VGLcMcC`$V8{A>$ML%P zPdKecvXlb}IDBb3E}sYS8|6(UvQFk-7hfd9Qvs^yne~V*>##AL4J=|UoC=-jXfAPY zrBVyJW9hsvCG{FISvM8sL1 z_Cflwlhc@JjbcYt#AIPv2u)u^!{k`?Kv|=RZL^Y2ywi+4+ne+hhaR+KZDQwM4e@nn zj9^PuSV}jTtb!`V+=|6n;gq=qgX&GrR*n)2iu9g?51==IzWv`j8ewB-gcgaxP-n%(_yE};Ws;zW$8oTl=#x0@! zpq`vXOt-6kM<=ZUo%vVVV$lD%9_D7YFk?-is=9zj4 zNR48p0tG!3!hpSAyz;U;e>>FM`1bOM+ZTzg#Tlg82<5pSW5v6)*kP0IM-fUN(Jp^j zNGfG8vC6jSZ^>`8w~FReoyx!2ekwtO`7G0}5`9FJveDxr)9{pt+GZbCU*>t(tI+<2 zy=-|@vr9YwS9!Ne0HVa%`90?~xi}y8M{4t<4ySKJ6k^bC*-hu~2M_rVf1Htfr=?x9 zLm$*PeAS7t(LY_*T8D?b*7ayjh19CvF-YtcpQ1us=d6d1B?0M3z7dB}E!KZW4tayt zJeVn9|0_V?1x)}C3JlwFSsBnv=F%3@a95$?p?eC3ZcW;=2tW>mplRwy#ws^co94}l z&O4f`RfYIm(L9l8BmMR-mn9Fb#8?D0(k;aXb29{>o36O@lkvBj1WI(Fx7fEc)le0m zX-wz0N$-Ad`rx+QxU3gUe1)*K2;02#eqQTJ^2Kof^EtLiS<{i8W^eYf4E)-`tXF{R zQ^siB^2C?L$IM&xJ$EDoik-Ci!Fy^(&qHY z@Q9+HylIqUP`?^z(H_jywh=%{3jyhsDy<>I*mUT zp)|ON(>w2^@k;t{`;{R=6kq)WWBf0-vn>B|N!OPb|1y>F{1ln)CT>SC0*Pp076LtQ=fn;8l|_YT?PNr6j!i5c9actv4uV zQK<`fP4umV2e5hip?c@ZJJrW~;ysR8DsZc3a>>?@$EV}c*n9fvFRx8;NQza2rUY_Q zCMvIhCFe3pDcSD6Q60}t$!qH!@BBu@91+10+}#yI^AYSfZD^dMJYdUVW3u-L9f`^ zM^JGDz8i3x{~&H8jSY_w?2@Y1I@`nK&~Ll7A12^AbamzGcDcD9bDLr0Gb1b04_BOt z_ygk~8aP{a#`3VIbgtJ}?mCApWb{q`GV!iS%Df_BK%|H{0;TOG1pEy_CmO1CaXno=F zZh}yZzRTob4>X8cWMGj6v|)0}$9MOU{eHGml6C`nzuKrm$3)3K1G_rZg-qu%K5pVY zsYE6|?u#py!Jcfrj3af`fJK*kEVx2PLS|U%jZblVakM%h->`n{T83PC?m)HaMF;eTNy9HwtI?{D7M@(iwfvc1 zA?0iz0)|-MHOu&Q#yZRt+%H6NkX{eR`cZ_HSq;_)R8N`$ z_>E<}YQQvRY<#Jy;QBjT?%({e5T*xJ_Bwie7B%IZ#2#NN;F~lf zH+RWlbjzExdZpf)y<%amhUxgVR3lV_M(%3&o+m{1P!IW()VfYv?M4H9?oI<}(|EHr zyC8o)PlrW0r46A`5zA%L_}O%>^Q&yHkcG5QFwe$VLXdgS23}zpP2GWpk4XRUR=Qdv zzwa%oT4W}J4`y5a#8==Y?1>3CsC%r`BYVglC#%DRW)OW!O5@%YmBldJbv@?zaBpnM zhmDMIWIRtQ|DwlkQh7)&d{{ktd*Sqf{OD`&RCWs2F|7DDqYs2i{q@$YVvbg3{L$Xw z`qQp@;PEa3*0Ur|v-rJqgWkTTUME(G*}1C|)m&ZMW?ivQv((~li#pAhZBwd0*DUh= zU49}yn%Us4Fqw;UQ(mXSE2)nbe4JdDgO?)OIu~CMthy5uGcQWSW8$k*?C3X}$)Ss3 z@)*j)@23Wh6P!8bmfx9_w_DJ;Gm76QWQYoJ|g6CW(su7^lE{yc&zM)1`0w^-1i zB1HG_P75rP%IrqcG=&!}FCxRP^B6wlb;*l%G0Nc5Jh^T)SY7Ethd6J}p%)09M{y;s zWYy@8m#-ndwO-^&ftToyRa(v}xq=GT$0&<7Po^6LAj^b0%kdNmy`PVbKDNnWx-!jC z!TyKP5Ghr~AY36HkwKsYlLvW`JU7Ye6qJa1XL&*iWL(j~qb;xZ zos+&UdW$_dP>6l%(<$7-BB-x~$EfDEd0^9iV_+DnAk!TqGFRZHZ;&vbxA!P7#tR)q z^U(3}^ZIgf?rsR@)}mzLEUi+EpvuLX0bkoiXSmFPn;Ry%`b`jk@_VD&+Jd-W4-=3& zw|MI#hQknzA7i^^n6@72daw_h0)TSs*bAl!J*vYu6Rsr(tHqUALX zMZ*urq1MA+c&Z)e1S(TtX`!>V*sh|xXSE*ehbQFg8V!beoVuyrhI|cGWr@5Efp2nU z`vx1;T3YGwNo|a^e@~W8$QETq>O1CO__tnHsnzRd(e@^GCz_`yd_^WrozL~w@{CUd*XbB0SkB4I}2ObkAQ z)Rv9r=&jvgM#?-Ug9p*OE80IU`81F31>e=q@_ab9h^kxYqI~G$3(0Q(01o(mfN?)V zAEnr31^jN57X&Rc)PfN4yor_EFS4vDe@CUxXCti^C+<|@;t#DWHT3W+rg~Fqgte0P zaY63iSMD~Ju+Ji2oz&C8$jZCjeVUs{IpmpOb8KEpF0dzl_)XMtKKha%O|4uW38Uhz zQ4%8KK@Q95!%@!d)$NN0qF9v4yuvl)1(GC?dUnWjMO}Oy$yC;jJ6@H@qzoja+~^ax zL1mce_WZb&seDKzt@4we0>@^5t*!{(Tj5qCJ62myKDT3zYSNynV5JgGkjeE_<%1I4089##43Sh3N$Y&O5-L~jT{q78dC&dZ!b(u;hYqz}wZ!r# zj`K*!JYbgj5fCr8mQ-}%)0KR<4`M8u$>Xw+@3>%OgFi{sq(vN)vZ zof9=N3NJXQDU84cmEQ_s*Nn~&Hq5_6=#$5+zM;~;VAhAO&x!(Dr8eV>9e6wSKP6X+ z;jKcD;O>;*ji2!T z`uS{Ix+w;KY+(icxjNsVeBzlD0>^Z5XZ5Z!O;5$P?5Cs^PgsX53ltS2+q z5+>c+`L|T#E3q~SE_pG^F%Qz9R7n?S73=U8Ua27d@zn4O*wv>n~2lu<4;sgzsmN}lXd zm40;LM#A}Mv|(>@gy5I~myv(85=Zb$|fG z(3xOgzoN7~MnZnq-@3&+P7}n3Pdd>|%q4aB=&YL_k%kdUYWO+%q1;F$)@7lnSaU8+ zelk6H{30~pQVM+Ngp`%HV*n6c+na9nFolTp|nc1>y68co=HKhduRNDHGd(jpC}CW(6tKo z@Kx1H><{G;dQF_ULuPRpA7P=jbaTa&X9N|y^0>zhL-{9@t2cE=t%q zc%smU$97w9Yc}kjXa~&&Wi(93mjsyzVn;-J 
zjx_jZG!u%kKL(!Jm7t_s-I+~k52@HbAac5;hh^Mvpys6EPpTVH zd^=N)V0dVrx>ia<5o`Iq>^=D!YEntSB;DExu_Qv5)YKX?{oqWTB2%7RygG%}>9$9| z1UqAIz6JaW&g|G7TCm38Ixx@D5E03#89)A+m>Pzi#BOOvWIZLO9Od8xg!tOJDEPd- zlkBTWzrEz{V#l4QbKl|F+9pRznDJY543N_~W*RU!WvNB}Y)gzlvX2f@ZNCDCeI{^! zSgKT1SE$0wP|g+Os%kn7d)WcmSF4{+-!?)!Yqku6Ecy~3WXZfSI7{4C$Tb%VLRUo{ zYX(No5pNfK7<8Vm@JcDy>+{y4y-|bn@jojI3}%CFTz}lP(q(2-1{qNYnKX`oM-q0n zqY;&rNnDtO#yR&~$=o{kL>(#l!9##!aA%VkYK6oTP|Pz+Is}__g$ob8ynP3cNPn^9 zy_4BNhC_*49}h}YXfaJMfyZ(47g4`A`GNpsQ@>BFwfufTJAlj+vjyz-DkI}vmTL$0 zR>xitmZBU~O+WkK0eFVNY77S?4$&V?#M-YIFf|uMz+#fQ(*$#V7J1zHcGW5pl##VOM>4mFUtFJ z|7f}?nO>!Jb)K;rEfid{$3!I*D=ldMPo7Z~?FYt*z*B;9EOl|$>m+dYO`iVq2Dn#mf;!il}wGVF9al_?ZLP+}o0qKs}=?#&qAwbDRK7>&~wZyEpd+iF3iGaz0vn>RrYCpkyI1BNJ zbn$5S;&R~3Gmj9N&Eq|_pv_r0SRAyM$Zky7`$SGAAU!5StYov z(8(|wcBhfcNS@&P3)ZA=&KZ2PG9K*){`S&FC`;fCYnEsZdr=+#o$oj*i2(hczX$T@gBGz)K@raY5;dIa{PReiG>A zA%GZK*_9%7nc+u@hmR~EetXI*TAVYaMQ<9+^p+;Le#v>lqBG&Tm~Q<5G1$XFN&S>^ z`__T?VOcL1N9@0x6>8#S<@p~vziy10Ty;Zxn9N!&VKI~3!(2}$#D@veNMErGG}iW{ z2oLk<=*-_gygd-C6F#@*{&k(HN4Og#41*k(Yn&kLnqpB|UkY(7SLPR2^O z#5*q!41IoCjO4-;%wdaGC<>vK+(o z;A14EZQx_MA3ys#uVEun?x|F>gX5o$MmGb0}P4;-|Y~}+nvT^2%MOgM^#}Knr zkyF~jqsMkNqAw51n^K7+-yHZxg$sLJE@fQtnu3=;MdzI6mRmJ0_uAKAVqX5Wi)MjU zdG9a@vNeKH$Q7r~l{z{ozp-0|`v)it zzUm3xV-g>Sjuaf|I?nf4+1roqN7Wf48WJTLD1XRE`}BVMHv=cbgX3qE@&{%+d8*~T z^r3ajcU~zs%(O22398Sc`2Iq0%4)JRar~)YI5~_;ZsAj6wPpEjcs!Gpb#&<|L{DG1C_9ZGN zYeyhgK&JKVKD}nqzNna@Kot5L_uO#pq0>QRoyLJ4Dq=~$6R7ib%izd9fviAZ)M!Od z7NYzd3q<%_(4?KoY`;LUVBCBifBtb`Zn<$~NdSu4hKNj(d-sNjpuG2BHZq1g9ua}=VZ3p=#=A1fp&!g0!Ky@4xm~VG1ph5?5Wb* zD+=TX#)--R-2MnIj(O|Bi$FKYkxAnkh7B$^@41r$06b1UuJKDM)43O^EhAL|Mr_SaeElg2 z?1URgK1m+t8y+pB!RiK%ee6+Ld666yTf_|U&94Ub4pgH&wqTtx zhJLQ=-ydrubbl!YSbq~u^A? zs*cH1>b@-LJ5M5<%9`8c|NQ+G77<@~OaK+$QKb1BfezazUqfyo-^xB#72(*CbSHF=$1;mXneD13c z#ghPz!1v!c0$Th{CiPSq4_E8qmfZ|+k;vFT3;RLOYl8w_eZF{|oLRV&v7WW4NI8b5 zGxAcpggh%r55!i)y|Q4DNCvL-;WxKVmDSYZ!1j2VL&BsapMUs~7fN&M#(nkAEt-Yd z{kw{Xptth1MQ4Sops6oBKtAuBdC_h$Xzmf4)DGiK?z1e=!?@8kb7yU!ZH~tBg)R0$ zRw>ufl(VaL{I+mI*FdW8iKN(*8$L}?o`srZCUAYkGVOi)<4HMHJt1p zi?p?V{k*j@pCYl6LHpi$t@}VVJ|MdDH~RjPiL_q|%jwDUACSNankrz}UTpV+X^#F9I4|kU@p}yXJ z-OkC`f4$*{WwxkJ@_j?C78+}se816yIKI^MowU>=J_aw75z9S0oLsZy6T@Hc+l~~X*OKQ zYVf`LL)>}R9B9`tY=CtxR@y`#d#y&Dm2(R?HI?qnyn>-&SmjUDBCAK^;%6A~>&TO=!xLv!31R9E7g=|&5tI;Irce6WH^_)5gSafge(d#`i zvXS^zdY{RBx~ZYQ1k%1fU*E5ySJGu9>Ubk1VEp9-b+%ekB&N|;cnqGfsi*X!q1_O= z)s@s%uSG+cMpZSBNA&dB2RJ3r$^S&R=)-S$_V2v$9}w&{2I%sw+MB9LS1A=aa7TOlR*^bN{ANG@b=3{sHJ`62Ou?f93u^Kl8a2 z7x4RmzU8aj{kxsO6v?i`LjHfSMCWWEVTr@%?+J`QmG$2f7=NWhe@|fi#SZ;XCoukh z9|wRRwOkVQJgT~_81*e%IODH}oi5;C(gVi3SSkyNuP26!!wh9|{{wFvs!l{zFuAkR zcxktHv*CQ;IoCS%8xJSV>UuV9hYU*pPmWgjj{osQIGXVIGSulWB>hhha@7KZ0ND)n*d~gByQi46y)%Rpr58CRKa5nT3@p9-iFW2(3%nYE2YzdQasq zU%UKBVL|!Yha2}5E_1)!zV${y?ws!plLxB#NkLa~4IX~|lEuVRQE{$d;45d4u?u(5 zg@_0zJA;+xm}~bV5wjj!2#0ypX)JC&)&aIwO@NoA#uPrAsEaqFNHk-Y_-f@XM~2l? 
zya=F@(Utuv&A!CU34LUttawJv+OMI}RI#Bf8lv*Z0Jl9bmZ5+PZyXGKe7Hv=7bfk$ zwIsTg?ARuG`Af@}85kq7v&K2%qDWk461#={AhJvb2PuOkASvt!OH3t|L7;cBxys@Dv<)!D@)qQGdH{^PQ7J&T$!DlL(aG~vCf|+IT%DhYb54BSSD)>kH#B2lyk$vY(!a_H zkS(0r0e_)MsM1#Kn`WgLvix`3iaxp&JaizUZx?%@=LHV~J-rX@wh7!^N=2xRplYbD zg;sbkQEcfd8AEhIX!2@Hja%Q{rttbS&}{;T=bLXEBG1P2=t$tZ>Od*QZGI<-Bn2>_ z;DZ?uzjBFtk`<6eJY%r%_$SPp><`lR9{S#_nJmd3vSncQ=|{KzS`Y;Ug9(K--aXBw z8}V1kJ|oV%DJ7T0yN#W<$>m(&)As?BLzPyEqEiD)$b7a<2OXB*78=CvUTH7NDwkj5 z)vz6P*;!3t(_2u{#)si{Ylhns420|gi?mVn=pjT;aXik0@(Yw30;4w?M~(6q1N8t# z3W9qs^0fQ@>IjU0d^HaBA4sEvR0Sgh?ax)H`7jT;X@8A2Vft{1Z!Ed;2q!5#6d>4= zXM7Ekq5P{=75a0iS)~$)oX1nY6A>!CuwdYE(<(^7i%^9Brd`TKJ|(a(COzeNN*I__Lfu>_Qy;zMz!rf-RPP5s|DgmUw?B# z2Ljo0uYh|e=!Zzl^+XQjIR%f#a=`wDhy}KjQgvH>JHA8EpFVc z(WX|~`q`*6;|PXovKcO4w<}*mDBRHMd#~ElM{;B0$~^WPK;D%@K@nfT>IgNET)}4( zT&ci|XY&sW_=8V(xB=+2+__HHClIRlBbXbgu5n} zXqXuT!;2RwZ@^?(o01ornhfB8trhCaJ2eNt;<~M*K*poGR83O^`5oMQX&c{`j{6Cx z%*nQ6{=Rx877I-Px(G0=bW1N+zluRC{4$}O3aH8UBJ{5bmK4j%rS6MeNLn{O$ zMOdvXz{QWU=u@7~MPVu#0*lMTRbcW}uEX9nr;5hlboZt3XwWm0*R@1TRTw-I8gEy; zunhoWdSCBs+4ox#z3i*7PZz`a+VwR6Wtt2i{stq+LGl@>R}l=#c~j!slWv10ij~DV zNTl~vmMO(iHb?Ph2Ns5EzxxdOnV?3#asq){U^vA^xp-D(>rZ@m z!&$Ie!*}hpe@d)dfbYBGgd`B4-vSyz!mL$6UobrVR#bAkKLM4kSSd?X_l7BNpx8;_ z90f0P*#>1unys?bQsQxN$!pw9N(rfxR(iLa95exY#RJSAV(XicdedWBJLKl2-#*y@ zA*$0mRNQQ!TbFr7K|5l`Oj*8*M!FI`HFv?&VE1{Bd2OgK5GZ1Bt%|##WuJT`?;->4 zL46qWO#xU1_0}@~t+uwd+`X?lxlo^*Gfp{@+v({gm-45M+kP`Ab^x>gnm!c?IZ)=0 zx49wcIOfyppZXM4p~0eS6Mh$JAdqu8m=kZ&n`2f9pXTd6$yDVowh%V;_Bt2_!jhxY zMVD>1YNH}TE700mjQg<3%svz2DZ6d@uxXfz5lcYM+E)yF53B)9F1}5vGUVFRr;|Vo zK?7F(uMX(;zp~c%jD9z9KSE7mVaPteGC8~L1s3Bdf)sDyt_M(v&N4KgD;V0WakqSN z3*;sZfO5}uRdUR@!*pIrp4b}bY~l(!2gG`fQac=PI9&=Top%0DwxPuvt0Lm1Uc)T+%g_%O<$xtz7p zw8%nwB?Bk)#9n_H!7VzU9fGey&!iq)I_ksg>s(#wJn{KMB+<9qiI-)oVyJzdo)SMU zE`Iyk%`eXn=Or3-cd+}OVtw@CY2ww9Q+w0vhP_($j~y_#`BtVkERkiUrKK|pO=yQj zH&nVChZUDjAx}DAwdPUTljkGM4vQd5ZgvW5fvZotQVJcqUION{cxM`X=6(;m0P5AD%iol*q!n{n^2TelkC`>x=+H2&OpAlW+KBJVC^?sw5Vd@0;U zCE`o3TuqJ2uqzbw@uafab{&@EzS@5*&MVrcNCokEyrd?|c6ufv#YE-cH*ee;D?0Yo;(R70`oQMc#g&T z=qH|Wi~t+sFL^0(5Q#iY0YQ}roAsP%0Hstq-m-)D-%GWtG`rh zT7+T0>Z$*76ttKJyx{42-n7tRK$9&2A!o^_hrl3-^Kkl*NX8&*u$a0#B;Rizk3H>Z zgYW-ue*AkYU6%l7+>EJNmIAg0l`3Wgz4kqHAbbTcctJq+DW*r4G&DhhwW%-%5c>C> zB-ntbLv7_5DgjZKUEEg#zz@B+4%qw_yV{LWu$Ulk=C6-W!-&3*k&+%H9LLfjp^)JT z91jIV)t2#U2AfTI1YURA-t>&^BXIEmFRF@zUIeKdWuZrNz#5(kkAw54pG;PE2ZMXZ zH+H5dzz>}k2i*R^F!yftwJF6LgjydyUxL$rt>Xmu8-ZQ^_ce-;Y(zbwY+wLHEsr`E zdkviEM{q_&bDhK!AHZU4wS7$#pUGBn^jAy$P6t5x`cvRUo9|`q0CB5d+-l>4*vbd& z>20I_8$Yquw7WM;pL)?vDSikxGd^ey=ijA*r#Zjzhp7OfI&o6K#K3Ex5dk9hrZF{o zoPrTel%G^r@A)+gfVkrwU^SCM|IhDd;$qhYsS(Qw?;S{uptf4c4rHt`Pw=>4<1@j=Ebzy-fo0rf~Sc-Dz@DJNg|+H zX5{ ze_f4#Y4kRgVH;&D>(FZUYiHnn}ifpyn_HPjk3XIgmlt_wn)AIHzxGjw6vO zdSma9?F8>zroALq3|Gbls|~```TMeRH5XvKTO$ppmg*5hz8gZ@92N0AbjsR~r&aAz z4D@{cd%RO+qC?W7HC+XFqc5*~LtgN|4%@LviugYG#SzTt9*Jeu8%5p=*7NTQ3zXqj z=Tla9Pg9GFRcJNYUY%a*v_(}>A*AKv2lYQ!e%WEtXlxGv10G1@!0J9n=Or%WBq%(T zLx`N!ZcV$I+6JZrAHa(bk^m!e)~Dq-gOjEQpc`e1KtJ!u_wwR3L$O7^3W3v6Wo{cs z<;3INd&aZx$7NzHtEODm-{dp~Sx6DrFgXZb#GM$538O%44z(d#Em9VYSb~g-hMh;} z47T~o5;|6=*%Mo;dL~8HW9>2Y&HUe<8V?j(r*UZlF}@8>hIFEw$O3gd*2jM4jNw;c z3fMzWh#si$NFn{gDuv$#NdzFj?7Xc~Ic}(iRWMIV!6vh4-IR6Mb!DvLeHPbVWpCbS z;q;n?gAg%^HW#CS)nKWTdD-~OoT=+)Qd(#k#AqaaiJL}0_OtlEP;ZLYrbS4dh|PUb zw}PlUmQ;;cv+$$f7StgBj1nm7JS|UjJl3!=;sVLp%f!?Yf^mZVyo5G0)}K5gUn>}5 zZs8q1a{L)aS!Dr7k#s3 zgmNdTDsHL+_pbqOV>DSuw?;tj6`g{+zy5e?12fMxVlBsyn&_H1XI@!Bmq*Tnc7gMw z$hJbMj5L7*)5f@eXKxJrV>A4&y0r2zD8qwKgZyWeEoC?=hnF zqgOGxBFGEW@vlCLE#6z4e}j=<%-7psDMIHz6FPR1!x&KFgn(OBj}xFs^ol4MkiNG~ 
z@0Jb{I^jnPfb%DY9-LW&SOKgL7cfO(g&=smakwZ>@D+I&=>Mq3BHUXxOxq!82Kw@?~Xm_jYj;oX;~^8lBRS<@;5w z49z~uXYv`<=3)+c9D%e;LLK8vaKR1HXGW6s%fL@`arcMZx8JpY7qC9*0Z~TFGJwtA zd&;&5z-98y+swmS7G5=)-fuBd)>6+K^s#REI+QmccY4TqW3+eiyY@7iff+`}wKog< zv8W_U7iQ-e7d!R6w-<0p^CP~glQb%DsnbCTU4(L#K|>m5O`&?VDM)dkv{QO^f%~Bf zQHwMFxDY9zR{Y2_ss!Mq%jFzCm|FGVX{DPh!jzEP22$i>o_jvpB-+957IK3^w}?$S z_ZW4CUW5-{DjHlf&Npu3k1L<_G43`WQkxlA`=DSgTRGdl{DxcSQUgl|mjFt-bvniu z69?JND}S9B)PI1xKhiCQimObH6o*} zc1o9-ah|O?50kCw8arPBhv7oSSfSlbYWsO-sZzAoYJ2^JkrK^78@6G6ug?+SH|S(5<0%&*#$rrMAgPye56MQ;S1toq3zW&!Bpuj z&MYC*&6FI){wkZ6FcP?wr7RX#X+3ZS*D=2uG|OCOL1;t{?!4nP4bg-F=l2lM2}dHn z=#oBzLK^$W0ZLG?*Y)mz-JQRBn;Y!b|f^m5EnmXX}sLm7!{qGv+V zS4QQ8B#Wq9yyb-U=$RIsmOr+cOs=QQ6oX*xF%US;Mkj_o?K3c<`A(9?4?vk1(&Ilr zP?>Pv1Pj#*Ax)PA`}}kkW+0SyPe?9+r6T6_DF)5}F5kj{7wimNp$<1P=-l`hd^eYn zI=U|m`cwOxXj(!XLe9d_T+XV}(d#ZZZ?B2%>@9jM ztrW2r0nw3cv@GE}2S7&W_#+hWpjcBWzq1LVt7{CA+w}v(FB~|mE>vYvj~{rviHBH$ zfk0?W_bl2%5jNKUa@AYH-&{*Ydc`X9SULmPj}~uBmoIn{{ocg{ANr?PQ$Zs702}|{ zDiD|BY;VK`&x#EJR}H1+DQMpuj^%y3)($=SrEKq z<}P?eKmt0|5PWG`{g7b|qF89Qri_UOnpye8!8|(ofS*E8>`6LGkoolqD}{LOffk^`OubXKHrUF{i^2P555PN4uyTqYxRoQ2 z?t1ObxC|};Bih1y1Td-R-$70y?Adx0=8SYKj0uFc4(p+84##6B?k0=IK`E52W7Cue zviJ-L+0rOm(Ye)teNAH&J%21I0Bno*md}jD*9xzAZ`yb#PHp%3!3ehT9#zviuACwD z42c?wB0v0YE%bowNmyt@;`_rOS;Co6*RFJsApJwAEQsUiQPofa>?;BgUhoBsBtpfonGgmM>Wb=$9i*OV8?`|w<&g+!sBO7~g~KpAKBnpb zvUp1+huN=!Igc{JN@QKXFB#?3hzPXLGE)?JQ`S(uE(?zVR2zBHb_h}$Uz4ZV4m=!F zHL#K}4^AE0BUR+^?fEP`-D;FjSCnm1wTx~B{(5@BIwxRwkC#9wJ6{`S90IrZx#%UC zZRB;liE$e7SPIlCiZf6&iWB0YbR5Iv@4S%A_Md2)Xz02CuYbs6k^tzB42Voj9Eo}x|17vVcZ{_-tV?OGD z`0tRF3wXn`iL<&;27;t=cU5NG{%dech0ovy9}T)^Wy~|Fqj}zT?O3eMC0WvU5U0lS z3?8funIPW3o}D@?{Uc4%EXc)@9`XT?3PGu&>0qS3@#-j_c$rdI#`lt((dwF~*QndE zUh|b2P(tAJtnzsEV&xvUEWgWF#z2l}hdAT5ay7;w!*8liFH-b6{JR6fn4+5Ga#$|!!dYj>uwdYj2Smv74jR5{lD(kxwm3JvXV)biB z%igOM<~aFVpIOfxNTc)b4tr427Q7k{Gm$^aWb*8g$M*RMt_knWc}@}J_WO-ypX1uD zYq3-AqQm*05`5*nWw!OYan|vvY=-QFWX~1O?0#o+o?8K zj)~85dMHF(m~`uNz1O7C-v%-f?%FyTc_s#@5_CZ~Vksw;aoX zQu`k|*HQsG??n10PS+7cc0~p7r077gk7Xm1I)|K!Q$6wA`}tJsW{{}tNR`PIcR#Su z^B?N?r~*y%indbFOVFvlbIPu*V5xU1T*{966u;ekt`=M0}R5x5GN5GxYAlga<0r}VhZ<>w=rm4v#axbaDHnPiwjN6nP zZw{?Dfmr0s9&UDa$vXMdXDCwc>%R579rCyX3fvA4P)92Fc852|@>oVQtSSPD!Ys({ zvy>D+RX{ht3jfDa))IKgIPh?2t*w3cgw;GAgiXNKzV)^wHFy{$_w=0Qq2|)GS_22k zemqq!o|3Qv%gUshY?lE=+w4}#qvp_|*P|+{TvwV2vq;nv28a#0xv3Z2t~Xc0jkgCH z0*BA9G#eo5=bwI3{5tb?>Upxk;AI zse-@Gvp92kZOS&su#9LFI1ddL_gm96!(dG0dtF6s;|Y}GFOQj(Qg5 zr24pnWR?HAeXn8g>BX{t9pf+VN=OGkr$nRxx!C_K!&D2T-xpp1Kdu>dO#laM`zS6@ z(4UpcofM>A1o3%ka4cn{>N0`T>u&;Uj^nJPqyOyYu$-!4& zl(@w{!U+2VYt=-7Oz0Tc;!Fm^M9fb)N*bDd$d$Cfi~Z^4P}M2h6`flkJX2^IIr|A1 zGGZ5f2`&N`&$$aT#e$6whF7b*@E6&Eb3Us@6n--YT>rx$F@f=t-DZ3<`VU) zn`K@i%Px}0*Dxp<0FZ+rN92(;md?Lyuq+SyRU+LHKZ9? zug$of9PAm&XydmWc9v>ZRh_B~enhr*726ZoyM=G>H>jx6Ytb3^zVaH$auD?4vLFr! 
z4jka_QMSo#)=~Bbv9WCYw9VHm(`d63F>hEk1XQf<+3lbfU9z&U!8;kdp@A))#!Z-UROoHpMf}5Kwo7?@1dLNHZ1p2cLH#i9GT-tM5 z!L9BB>!`569mc<6V#xQ=Fw)A*SEie(3!OVESCO$-juR>8n+TS6G~QJ;NHNgunJ>Jx z(Jw{nJn^DuZ*Fo=9lc!2*OAwE7Dh9y+OuGUT;yH(a&(lc%o0~w)Uw07vh#8219eO4 zaTb0XixOsbGrj{#`o9X{`y(K`TMuFrI_!!DsE&I1y?Y-HG;ZLEGdwVhOQ%vXc0=$o zi3^pDvbz14P+9UfzZ)U)>r=pn_C6Zy8hv({yE-XJhu(!xS<^l3a{Jz@D*W5=`-8LY z?Jt%_7!W7Yqy{&K#dEJWcag+jY+x*KbSlcwwC_GHHnQs#pZ2{3xIJVWL#_{<5rdWU z;k%S>QWme=pYjG6G+tYvxIATx263vIj+pnWT@C|J z+)23@`UA0S!;e}Tiz3KBpNOR}L*>|D=(7*7UW!mmQm;?BPlDr`&f> zdDEml`JwA-9ZAYFiH!mHL(8>9eR7bPqNypEmN2Vn2}UL}Trp?}E<}QSA6ohoZ38pf z(Qqw!!`1*hUm`|YTh}e;o(JyqYt1(_G)ST8W4zemz9-rRwF?MqqJ{ zE3tvLdIOH1D?>6b4wlVR&Ke$(uqA`jW2V&2vdNh=I2bssXs{8-UUt_XGZJiWoCoRo zNKLvkIW<5)|sHgL4?)I2qMhWsC+Oe6^9g zdHMx5EqS);(IQBXo}mjpgK(Yg-5!bI2(>qvON?$#lQ=ouVKJA~@uST`f_+xXutz)% zM^FD|oItij6cGI4-Sqw1dGNF%?No@l9T#U_L9b3<7SH|fBULKH-V3y=GI-Xah6V{Z zc8}z_hg?(e>Qi`kkc~5>n5wyqi-`!$v6iyDTXl4RU~k(p%#uU)86`t131( zhPyx3#8F{{H8B=@?NSE2`jh!U2GGo<%n||4^lr8n*XbAU7Mc#YRSH#pjH9I@kF~ed z(dIPRXQBRb+R~w@f`&PCpJXI39|N);rBM0HE29P1)>E?cEj*Smai2d!&?M-cqXhc*ImpCD}|(EE@?RMvt@N)^Tl+FC<osSI*O0Nn`AD-$tyleSCYSMUbtJ5oSA%G>;uCVtyv;H8;!}(0DAXo|1N>^m?9X z*ZnuoYvW26(<2<>Qd82)m)5=7=4%B_Fhr5_vrqviWkF>l5mXV%*~@A`&R%`N^dTn^ zJl%Nbm$L=)!&;#o12o7De^3jGB}=OwIcY2zpevg*pBA`<>vc3x6YNTo=k8fzG!F7~ ze-$zJ1u!0c7lE$mn2gM7)OH-t;^+){&Q#HqS-t{_u{IhpGz@H7aTqBE7myalM~`H} z$qMJtbBo$uF+_Sm#cNoVh-f-D$A9ymLyr}WrORy$gW_Q@CN+DERj=A_bJZ6$x>h9U z$32X@HLq~dW_#U>%2p#W7T#IWR?x6cLzS`aq~s|pNmVLIUcSgoRDlA-BU<~G`?077 zzew=)dRBRtXHaP9?$B1m>Mn2Op?>Dbs~R$p`a=0rhN;nH?l3Kx(oZS2ZQjq8;DS;b z_J$1=7z#U(Qn9RwFrVIlil6K@a9k}}isv3i-JV)SfohT-TgJc*NsN+%Al05s%#-f%lKC3? z8K8f+9%p>y4)7D{84k@21;vv>hkl#Mg=w51l8Us9P6cdmH=z#gE;iSQ7$hV<(v$trrf)*K$r0QW>A8_lRW2!tz7yu4Q(E>)W zuX-nJaY~VMio^j0UN`)5_H*i`6x$G(1bGD2-yL_R!^BsW6v{XuaGudz3GJFyR)rDVap{2?Dt<&up zmcEcn{w?MPceU%L$z13bcO*H^nTmp<&nn~(E-Of>K*jETGd*_{Wa89%2<6ZLc}$v9_g{*4hbJ5o3){PQ5I#roPJ8 zw5)`VP@HT)6PLP6*hZMEgLr?c0abnW*Sw#hMP5~kd+-C5@S`nv8?vuFkuWbFnkm8l z5>aQ|@+fymRP@{?!;F~NY1p8h-mt34R*KDNbDVzBuo-}4_)$6;6!PF=J2qEXr#Rr7 z?=IgqcI?}$S){z|%bfcljg^KZDc+4UV3GAIQ{{Rt@M=ou4bzf-ciW!|D*?Yfojw&x zzyD*VI!lFtI2}cysk6(bEAn;Z44>UNV|0YZAaMI6Y~H08GfgkgVEBo}!ASJ>I0Yu* zzQys_6EO(a{9$w4vUcnlHKj{z^@8w;EzeO8$vA%{Wp%EF;@xs{>$G8Ad|2pNj8guH z{9Awoa$YFlS@7GP-q9JDjObmrA`9Cl4)XOymD5;KE#2dD)4daNW2)`Cgr66_m7dAzz)Ozsq?A9<{nM)(Z(*nJkZPd@rJfLJH08@&;e6 ziEfJH(f4`Q)o#4@n)>Y85Q_{|PuWqnej5A6Mq3+lhfQtk588YN_u7ZY`uR$FKEP)` zi>kGm^eyiBua{B2v8*e`RL_3-a7Naz+MjH^I@T3YHt!%M9m7KxgjLaYlwXJ=+_$5@ zoHAQ#fTH4dj@y?n_fWs^&a`#<7Yqg)bS!dxSkDVWW$5jw5RQ!Z4EbhcS2Exm+#s4^ zn+h3RW^!8ez|N8LnU~#TEcueS^sSd=#2bMQ(pb7#q7y8FOh#uuzLq*m{A6`$ki>h?Yasb0>R0!=?#e0-;?-;)cx5;LL7? 
zOcCU8DvlB{|Kx#bN22)3RR78=RkO9w5hya#FF*oq*68L^Z~-nQDp_f?8!W)p|Lyha zYJ!`CwvklF~~Lm ziXIarhSDw^KrwI=+kU30K#K$y%}r@KFG9D^Whq`6q>q3)iScs+RCsiW9*$d@D|@1VFdzOj@!3;?QPJK z>8BZ62ATd6e}6Y@XJv&&LDx6-+3IN1_P+u9196b?qJNeqyYHI< zme=?&3hO76Jn+3p5oBPEe_->V?0vo`^9BNJ41@&@b~n}TO=O;V`DND1?KQ)F***hh zd)Rubz({u)rN5E3qd=e4E`s%I`KXQE4=udb!rn3a%Nz}$=jyK2Z1}5ayU@bS*LtRU zX5X1xFKl4Khh8Pgmhhn6zZTxtou=s@TR=ysbs<3^ob~Z1f|A^^8`Cw`gBy(&F80f9 zS{+K(Qc6+2zWOO|MS$ztZ^PWLIKsLUW!v6hUONR;wX{l1eK+0WMw5e(PhK-skY$z1 z^6rNU+G^tHOtQS_=pY7B^ztS?YYV@OtxRJYLC=t@j;3`J#hd5GaR}qP3ng=PV|A6) zhJ2J!+Hw6!E^|8@O}RxAGu)WdteA_eya2r+rL0KGu^Lw_*wGVIBOkf=)o-+{Dgzbh zJSt6zx|4XqleGup^}>`*Y2$J<;huN zuHo_={r)Q5$ct~2wn@b=!nVgcex&()Ul40v4byP;R^`b{(bOVWK+5{EuxV`K*8B5U zWAEpy-2g}o#VnPU2v+I}>~75_jIhsi$hE?_Y%}p;heo1OhMv=MYoHJ==0rshQMIy$ z2?nlF$v>^0Wlw#|*)0fjap`9=G5XXt6Z2Qaj1kh`FvgcqAP2qFjDGV^mJ#SV019&5 z_$;^7Fh>31L^PI?EmHg-Vw;;4(?{cP*R1nvwa5G8GiK#Eg+vu z4f46e>71!V;Tmj(L%N=(0{!PHm)$s@tF=q+e+iIN43;Vtz6Z#szOp%CqjpUrOsJMh<*B>*7XDpgeSfH(auMgRQq4gwxrzXEP2{pYiN zsgQj?GUg9hcS!n0!2jf700n^H1mbMy`hk>8cgr++{<0_`!Ih|I+sVGx~4z{vnl$ z|DVwh61_j@^Ak6otWX4g0a!y4e^POBw^~Zp;qsME<3%Bm3NPMxi%7AsENxTI!AO|( z7-LBfNJ|Sp{SX5Mf;YzY$3U0zKWyQw{NfRjL1d$k@!)>r;U>{#*;8ZH-w3KB2!xsj zcd^Q3Rm#zOIt!)Qa_g|M%*!hui&qQwoWIT(7YH|y&VL$}tGr0NR?cs+^8ToO;Aet4 z#iV&7G6p^>%rXYYT-126O`rqm$`8`^c~h8;sC=_Cjl*o&Ag?QEuMmHLcHlS(l*0*D z8Q8sjTD^eg{q#An>JYlhiKt;aFQ%jx6R)xC2EItp5V7epsV6lE8~NI7J+(`5KeTIH z{4GEdG)o$s?J+OX9bhYVFi7uqiiN5mcvQC93uu&(o5I4tse`yv34(=1h4<}~$#_#u3@L;f9!svilX!tbKC z=1A+{YSu-5nMqZYAiXpwL@$l^45y+j`;vda(YL2wI_%4A{ACJnL6m(KA>p}?_dI4! z>w9uSC5f#`D@wwfJFmIcdbPr9bbSlsQZR3e{Cz+vP_98m%>=XgeYB;0CH+_|$O>Kn z)upJE`SeTD@qz*!5~BmuU4y0Bxo%E7(<&p$apA~CzPYD^qu-D6jSp!B(_*&n_POeg z5#`*SJ#Ln}m7i4~P?AUzUVwV1EfdmWp<@JkDEvvA28)ylM_W()elC9~^?4H2o-hka z=s4xINHnQ!BL)jQx`E*h5HN3%AKjiWGReWk)zknOtZvnKxDN%NIjM@Kt)#kA13M zyV**U)|!I+L}o?(a?Z*yw=W3wr2A$1HhYe$K$+mMe$CwLU9Vi??_!tMBXUgm?&&CN zIbk;YQ)jDWBUwX9{7mkOAfZYq0spo+Y?p~kn|pT~VOGt~Qcfu+u%W&)nMI&GSG<&? 
z2ucW??7Lp}vzHlA@8KoNm`^v%Lwbc)M#z?y%@&ha;$2zHP$lwn6?)i+&C%q|k&II& zYe@tc>Is0dnM+!l?HnGdpIwkfg{#>t_1bP&*lq*`M=hAOPuQ|lMh0xeHWsaS^HGVn z_jIwXM7vsH=X1mu+YG-YM|uA4JPCUO!7zQIkotMQWCR}p8AiDwTXAzV_93f5J!+aE zt`zZrvD0*SU^eKzJk+{^!h-CP>sFV8_15eDj#5p3*Uw}L1eQc&h9=YDuP7W$sf2b{ZTZ*cGKlbbnL$~r*RTP<9=!+Qa8xR1 zM@pn?)7T`{un^X}u*p@SE@weKqZ+g&(0578tlKSzq*+fZ6;*3(1TwD$1d-gEng+p7 z%D3X6@#X_u9FAi^uwLH2zkPSV%DVZ0ZEAH#Oe}G%8}3Eb$$PFLb2FH{&fcJG&c()W z&qlPAwABrO{JGB|j0;5o14dr#x8r3diIM8Vz>^BtGkBTasJTtgItu13%dX#5Hgf(N z=9^^DFo8*22yzQJd%I`$-Xk@!)Uvv*}X z^s+cr0`R*{NFdECh#i@|y!EvMqmg5GbJf=5=8nR$@jDS@klx+q;I53DIJS}ywUMpX z@4Ic8nDUhLJ+AqUsm}R)cyk_iP-wzM?ATC*pPPNg=Q&*Ny3#3_L4WP8atf->A=hd{ zn{~p|8dIa!Ls95A&OVCcWgogRwk~UU95u%ZuCquQq;D`_!5yHPjL!p|h_4$358B6f zruP#-2I3U0MNY@#>DizzE6TE!Tn^XOG4{Ox!sc?uzp>xeI3|KdP%zqheIXLIfo!>< zF^JEzSgdM4O=BZ=B^>idasWe*cYr@lO6aQL7_EJ$(cS~05n!4eOtRz|V z4HD7&()Kg&)zVZn+2@7^3T^L^g|G6$xG+z$G-JQ)&Tmv#dmHI!zjXuo$*%d?d1|V_ zGuy)v>lUBadKxp&+pFfQ<=bVW>egL)hMf_X8C7fD+DzG%<&q`+mnJA}4)A-gwm@#H zxhULgAAM((A^JLDKbS4PnzbxtnXf%1IY)?HED@W7@}E9Dbu&T)ftSy(H1V~Vd&&;G zV#Uli0wXi7`!{AD3O#01PZ{sJt~6IMOjj~evrx)UjMWh~SRT_!5=-@;Q+EDQA2j*| z8>euguu-<;j0a6pzq0&-_Kj~{sztC1n%aa*jb~6bZAh)4q1oJiuC~$VM{U+-sHZ|) zw#a3>f7BavRZ>XL?ON!b-51;2@Fw&*ypt&I!F&SXbV;Z7pv;xKul^wsy3| zZ;kz1!g2rxqXIhT2Pf8wE58udJOov&QQ(g5kTSj-q^ zocEn^?I2`XtqKrs*Vrq#ec);j*o9lb!n4H$9g}S4#kKOxg!p#s#{4YSOmEF8!|6+o zD0*e>lH!~xl)gF2&N(^pFRe06<8~)0s5C5gjUuK&x#!i<02nQkRng3{b^mKYqU;as zW$$IUl0!MbIGCQMTyc%37+YPde>W`skgkS11T-s#FGohz(R;OEF6bZlb2_zHyptLA^JhOO?nUrQI?QHb)g| zV{si^K|*EbmEsHK<-q``aFa>)h<1{sc4US;6$4CwBM||Ou_zH#gtsSKCqen|1$%?f zVvBxPqO&_g`HR=W$iAdZHfnmzuDh^le8k_hWhmAo=x0ASfZS1fA7sRkE4R?4*lJ?$ zhGtrc)sEQG4yHc!B`bG?ZQ98R5o8acrH5wR`mW{lpVJ6ak-2h!+foj|!1WC#ZFKu$ z`gaT-zy^Z3^w;b$*e~*fx5&7nwEpSw5*$Xgw6%Q$X<^g<)y|c_L)q@}_w=?Nr8-V1 zN*mcmGKv^mFS(3m5=LU;SVs02gT`8UTfCN$twW71OO}W+DKe=rSt6l~F~|~Q&6q*N zocnRwM)N~|K=adF*WA}L&wYQF`@4KT--5wf2UL~#p)?IenCv4b8*afhYIJ!tva#j( z!2M3A%9n7ycfP-vD(8X%uv4y^m*;%tg!Q1Ap_0(c23^x?jZ# z`c?+%u-n!A+i%-~4Rm$Pp<`$-*VY4tcH1wV^P)%;BfvtwtZ&VuF(7(~pNnAC2)Hmn zM>z_1%t~d>23YjIOblFZoLyg77BBt<`^|nlnB?!GldQGliic+NUwAd{;ih;*+}b=@ zUuBOjWfP4Yw<6bi%{~6rlb{(-^qF&I9aNVvxW(Ct8})1TJL1!B#(bqP>1*V^SiYrg zhP9l~z3YSRR-4`TgwH1zpaIF>ED*Du5ECjBG92huEDVKOlS?FZCV;SZo?kkOwX9uL z23siyd_OGXTReU0?HZb$aVd;3vvrFMXJM=6GGbPBB&y7vs}T?-6w_JRz65p;nteqN zv7Ilh&2aDs+}^KEtMiQaQW zwHg=dr0suf8Q;$PYh~tDEW}4(jWgAGjIfDZefzQrDd_3=WfL#TqUm_pg(>#5qhvO9 zih_Ws)P}zCxkk~za3TPc+IWhgeAlv&T!&|`Y zJFviuP4tsJWv!FD6OOI9q@N~M{Dh;UmKsv&E^!F^fZ_)D^q#?OxhR66Qo(VC5B0eV z4N`KI%?Nk^x7_6kbv8)+ra&tWT%tHFySeVA!9n_zR9$xXRq7uEtu(pAVh`|R41xmy z*bGe`0x=`aW9)QDeb1-Re$)~61Jh}c{?{DOl}Lfb*bAsh{;su9X4nsiqo^%$>9lpu zySKZMt&e1y-kHD9EiyfCRy>IAZ#a=iE{@zmY?vi8@*Am=Au$ozT?$DP=t-523Ta_O z_a2V0ZVi+Q#H2awm7PoH*C)gSl3c=u_;dl@R3vN-c?S-8lKq)V;u}tPAMneZX`XPH zJ{DP{B19on?zteF8Z;fl4$SPX8i>~b^?)90VBL$f)R#g?s>S+q zS+0TY`mbEVIcNezS@F+!*1O}TZGQWki(gWdvgOZLHM!dv8ty@?ASU(D0~@#WzxIuR zSxhnJv?4+TC#>?_O+=g~#$-p>9V)$4!Q@jm^>@bh&;fw6Xj`QGW4qwlT9s)NETZ_+ zFhi6(^jFzgCv>PRE!CIjw)wxga3IJ-FT8Jw_sojdkayyc>;35rLhY5CTW$@A##a6El zoPm=XUc+yr#8m&~#rLflOIJ(g?FEKC38AxORf;~By>LBTotlw+5#=FH`*;^gA1fleZU(5J z?gIo3j?eAj*=qK}!^xo9e*{e>R9dneZKQ6umyF4{n0^TvS^kp|sEd$d{# zua<$a8ur;kH)8;FSc9?Pkz0i_vNVvL5=)sMXesIN)X(yelrzVu_eI%M)*Bel9o;cA(X*UokUM*vuT3X z@^!~b62!fRE4}p)wvtXd6MO^W09mBSEL1!(Ues9bdY9gw;og@|@eW;fLP8@?=HF2b zm;rB&k(}&HqcH<@(PNcvJt&^^dgZ;-&jO#=VL!9&dtF#j6Z;ETw`UYCJ>`C*Cta5y zBv{QSF}kP?1KpyIe-&bR(|YX;fvCXoN1{FhRlXWc93|@(jv6P;mtkV8q6j*fswKlt zP`QpA3%6j@`L#LHu|SPo-d*FH=L5+WceNQp>yNq2WBNGsAELpKNrN{4iJNh2u&f`A|`t#l7P)WCb@ 
z{@(XD-sgF|pY^WadjI;d)-c1&eB$hL&feF)_O*GVt}2IvMTUihgoL9YFRh7$gei%H zgt~@-4&Gt*amPSHx}$0(C8e$)B}J|7>|kzXYleg*|0YrAwyxG!;!J%_1q?h48O041 zyx4o7CRu(vo& z>ORRZ#mUp$4fw$>=AVkOgoZaOLH>kZs$lr*nIiqRL+bsmR1}Fxj%U4G>U|RTUbCoi zY~LvN-msl}_U8!Lz9=vo=vva|>}nMm${^19%oDz-$eo)@Gl`7ULzj!}O=NDy+;wl_ zv^Yb(GgttXCA^pUaOphQCtV(o5V@MOJ7u)BxKAQ&Nm(JhAAJqCB8G6EfSNC$Nwyp# zimFb4EsL0+2l`G~AX=zTjG*tfkNT$nv$e<06BqAQ8{Uf*M+=ac(La(>e*L6aJ&q8q z`f;~L#N1i9L|R?0t7{HF$j+1JEGlnxj+z3?sX0_dh}~mW zd+y9E2QIa$6FDF|1?6)1S=zopHb}SjZIyn87oUAfP&J4j(pwDggaylgLX`->N%&wM zU`2gfe#V>3`)5Wo{Vj}u2d|Jh9m!sW%09w9mjA^|Nf@|Da}Tv=zv)w;5hp{)RS|x9 zMEo6>tulhX2Rm9ND?#*l>%kZ?x1ousilZ-Z<35@8$zT79MS4c_mIn9P)0>XX|ye%ge;7k2qGcApHXjCrWDEp+MZrJ9H22DR`b1A5nc z_G*cQ19Bb$$gYy`n9i$LJ{eSV=+@24flch%Vf|0A`B7Ax$kW3jFS}CaX$HuT^%IFs zF?Ks-_9qC)>hhY@)gwgd0$%bv=NASSUVeyRnlfIO+qL=P*zVaSLc6~1o9K&gODn7Q zQmH!l$McP8aYMZem3{r)(Q1`MKUASW^Ew;shQ?4&GpgU zLydVI>dfcaGjCh1YMwhQJaa=%!|ot`$!E;uO8LzV;sm)ZegfC%STO3j>!XABY?j1+ zqzdVbavKW0*~BLCyAwbq{EPY5KFLrNsu(L7Qo0a}c0cySgUV*SpJ~C!?pZV(Xi1;$ zep$v64#34g>ZE=k9egVWrBEDN`%5r6_`b`?_e8!HOStR=!}B-e-`bRouRgp+$d)gyv2;V z<$#jK5gQ}(UV?!pfxr#_F1~74+qjH}%Ah!67 zrDD({w29!SOMRF-m|LE|lh32YrqRqmRXn4uU&_uKm?~u`Uy(hjrKSbdG`6y|)V1`m z>b92WA-2@E+?je(bRfP=p%S(xzC1E&sW^i*V?X0GV>iPrm|-u4n_V=Dy}`Uue;j{& zdEA()tYV?kUMTOYNwg<3qsER`?v2WlZxmOvXZlACW+f*K1-Z@ zQIV$SJEOc?Ju@JW%-(;Db&T-LP$0sn=wYBX%rQIi{1VOMY{G)i2+Z&M}yQuE^Axo3G|Ichn1Iqc%`g@JGN zH5c3q&V1o;NA~_L(r_F_oYVUT{2mT{e0TT;c@6A-Sb7X&@H=xv@TR_~>)EK&7-pE) z5X}q8Qs=GJ$81XC1m)^iKH$uH2HPCDc~ zB)}e|lRf2AMO@{+K)HanaJ%(p-|6t{$@KD(ZR@Vb!CA&K)w$jomQUlQ+eI~e6y9(S zxk$UjzAA%Lq3fcDqqw07uga*E{v#J^MbovPF z@kB|`!YAJ}_p~qZE)}J;r`T|1nVp+geA^m29cCNei`{>e7?&*YLO|bTVMTU*Z)kT| zX=pZCZAficjAbHrO;JsSJA= z7My8*9X&N$PO>vN{iv<&l`~jS-nz5w{Ob^VF$&(WGQim<}7&h7I|C}amxuzjs327#YDHy&1y7n3KD3o<`sPUX|HGpIi>HFH}%Dg9LH zoA#EUb@tf2*77Kk63o;%^5M-8{t`ZV;(;=~@; z%+rSLG5gKEq12{St@_ORdhfTe@^Y!|1jxMi*s0OA=*DHCSJB1prS4__X)wGW-T{xf z?s|uce_IdN`R?S6C49M1YEY^GN4~2(bm7a+m!ZOh!lr)$PkQ%OdnOs>0S8GhXW}-SJbZJ(<>G6iyd&DKXsi!_LE$LhIg@ ztF5r}E|MoCWb9rBww5jqLdUh2!(lTbGvsWZM%K+ckdGfteR@ZjuXR)naGJf=$ZlqO zwl$`N>>n*OLAZC#*I7p@EE*2%hmq(YEW+)#`e(E5hmgcEEM)Kc756DD8BFH2!i639@3JfY9u0zGR zqAwSfx=VkOnH;=ZzENSv-n_gawz?dKdR1P5@^g`U2~7)GDX z6*XJ22=Lb$w-~LuqMSxaEF~wMzQztF4aRTp^Qiy{MueHJg1L$c5)*ihfpiO*3<(Xq zLIw{}Wb%K$mPKYjLjB`?6eOfDE2LY0en%BNBYtAQ198kho>AjMk#2*(?tzC#Hp*Y$ z#+1xP{p&UA8u$!JLQ6_P0X%D&I-8l}s@s8dThIGa)Pv2n0*JQT&ErluBleq}DGDJ}cw;ovWkhnB9cj)Ls$ z?(Xhv?%Zq+&KB&P0s;c;99-;NT&&<5tS+ARt}i@T?Oka9agx8zBW>nl>TKocYUN;0 zjX3X%mktnDk%tcv7y9SVKkn1a!|GpGvUmA&SYUwch)>u#**Mt$d2Vp1FydW7bt?}u z+vn0&cED!f8lnOMPlW&Y{(t%8UswFMBX$3EBo{Y7$G;!?Zy)`?4}I!l<}Br42d?QV z`mg!=^WcB~@XrH<*%3qkH>>&VRfM3@wT!%>K_w6UCxZ4>bVu_`ph9O&dIeRfhOM zJ_5fO{_zZ6BR^cUix~k+K^#dzT0+|cd1oH;>gm|^&4F6{D+X>l(*#We2rWOpPBV?4 z42?!%Q=vWs=IdOp3A&NDxp9R3!ms)gB;%zaT1*NhJPZOi@aO(Evt#F(F71m@fkl`0 z^?J(az0Ao4@7?v8#a&U#b+~Ss@*tz61QPNsJn{eCFK6BCI}*hY_C!*nkWetF1O8XP zWCN%P*-`%2U&mN!MUp5^he}fZ@9yc3d+(ovOc`>L{%ahHVV|zfkIb&l4hoiEcwqg_ z2sQDjF%nUTpW^>%hd^86?|2ERMJ#3dF>1YFh<9JiZ9etY*wC@Cc1K0ICa!jVv(vt|E6 zflrm`up293N}fOYA&wMyk5K+N;~~b=pp34HzozBCU#N=8)Px~RVYT1bz@5&WDLed<=)3wsYY5X5#IBmr1q8I>_QtO zgyK`=TvTPr_}lS_uie6sG#g44q6rX>!ocI`i;c=n`M($bpX~jA$kd;*Q1lyYW0Q@% z6iT&A9)3&Zjdxz{H%r7p4lduiOBO8t1+Bq;p2?vD-tfSCr&_bk>*Vd67;)rR{zV^J zY+{y}i<9kv&f8Q5KYhUs`B7UrSs46-E`+YFnh5@v1L|CblqDmK_UE<+bUb`{+_2`5KFE zs`yhzJPs-0xbbO*e4*;kQ`@z0aV}r)9wME-zF1l*weegTY|qt19m&gSjT#hYoyvq4 zU7=rtaYY#}-jQ*ezcuhV=&^E0xNa=-zqzjQI_2lG{7NJayvWs-D9y ze!o?a0!P_^ld6L`b{|zduay;rA8Vy>nMDX6bQ9q7O0#ubIp61eDZNtta|js!+41e! 
z;+jRjONA<TV+R7w(G@aQRKo=QI=oa)3Rq@!>Ifi29nufBASJM zpJ4MmZ{E#1bQtcnKt+wey&<4i+@7rPy;O9joeq%Tf=Y8i6c3*(b+z%X6w|_Ztj8Hx zqibJuq2Cww_<$MntnAshHaRX|A@5V$&4o5^1_rMgtD+SAu(`-;&0G|2^T8N7Dqn4$ zxi!|!>nQuh_W6*E(7P1Y_TJ0y`1>i`=V%OD+!Bj2z4~`rwkul({pH$rM^9dLXmuX_ zko~QX70>3u#tUKk)|S>?3*R=riJLO^7pWM*?Xg%sdYo}xZZ?p#uw`XD zrLG?TSj7j%7)706d+A- zdi;glL4JKG!|d1R*K4}X{-0pH)UV$a<2$?ZB(NDa=Q}1IKU1MIi5V1%Rn9mT)!1xn zMo*}0*%SN-TktNBi>B-E#)kDFM&0U-yumck?HG9y^R^P4c5?7pq3??OQY$a zuqw1F5)Jw7CPcoaSPR_r=v<3}@T>~|{3BHKJK^QiRmNf;PbT$gqo*@Et}n_qFc{rZ zxmy<5IxhEwN;L99=Lh>VrlbGYdX^ zwmVb*?U(CK096&vo6`<(%G^9XPbKZ90dL{4?Gp4mah~*XCGnfM z*O!YosR229GH5xj{<-aNu-&DLZl4ebsT;kEdp`GXCG$b_qww#1WSY_Q^TaNQM zii6mPK1%e$GDXb4Ju(j(+vr!i$Ow}x|0fLiE3E$~$Wo6$!;nl|8d6ys;n~O^Yn!Mt zHa7p3x+)}7R6j;#1hxQl7CKMLXZzlDYg~b+E-&xmR;22D!?5V(E%zcwOWmy;+eMG+ zduTazVCDLL>wB_!g}gXhs6BOawv3x1>{+v+t^TxB8y|<3$A0d~#6W(%Unz31v;?W0 zEf-|;LcL&T>B6o=zp-+o5281UA2Mq4*8LVkkb1#3onEtR$xi*0QSRbX)C>`T$%by6Et+uE>Fp2F|LeUD1{O!BGL#n6Khe8hSWt_Mxyf5y+>;A{|AY8RAD$mBzYO--E{`B$u_{enK zx84UXZ|`@0nXFaH$kn_-Qz}qQ9K0;QJb_ByR2X*{-4#7E5de|MP|JIP#MtM>+heHa zA~FsN5zoW)%%Z%ljm3A@tk~I>f;Tk9eYw1Lh|6vYenBzrki~zi;4;hq+SC1* z>vCx8TH2SK$m>8g_*ophXjiRZqooA?fW_J#Rd67QH6D0%!zkfH`J=m={IWNdCl36& z$KJKTLqxW_Tu?+aoeCB7E*Z*o%wDwHj9hlWPiJ%UhAEu7@eiZ@uV{aHm#=5|o=$T? zwzLf)O3gv)roB;#Zo4xdVIzsx@Jppe`}sNFPW-&%Px%V*D>v8B8|92Yw%YT*+iE=6 zPmqzd(qnyJUPQ+kqYOn?rbfLHkVyJ5rY2@L37Ned#>9}xt+N{YGVbEH5)E#`qTOZZ zTosS$aBk86M&$GpDz0>9-nw8;$a{j+H^MsZf-k~#Yt6q&t_#tm|8N-=A(>eOCBQx# z0kP8FbgDe5lX@IgyoWV7Abfiiw~MMdk<9W>Y9_1V^}Ca|Rf)Pdb~KKDevK`)TR(NR z--T;BgbU}Swpt)kQ2%aRcn8{nYyo2lezZY+e%QElZ_EtKBuMbK8s1O3%FyUIgj0O zCcjxf;qvM{iNieRMN@QKu|r0TZZRtuyhz8Ri{UoU2WM{D0|lfzryUI>jqO$8$w zrY_$$i&}oCUaZ&yTUVW8yMRawu6oWk1d_Jj#+w3dak(H)x&_>;(%qfI^hemRgYf z`Qc_GWaPO@l=ZP~c;##SH@oy$CuNR-Js4X(Zrf}p#DSxy;d!v=g4YmX45tN8+K&wP zh0VPw*zA3QFPSfs{t@EuO=yTY^hu-og7o5zl`cDJ)ToV>Wk2yrG?vQ7N$b?H-Vcby ztHwU}F3Kry0b1H={gi@h!(`Gr?L1LOVX_Q z#%3b>W z6;c)OR^cm$E*+Er?(n8m`d_22 zG@MyWJVhk{hw{Sm@ec30zM6KYiNSMx0#A710bSi|nuvfsU#m=@g{RyW65*jyf z7ZwDY?d2+*3cp6)f!qpKC+i=HRcxI)5Y2gx7FOB)rBOu z^-koOI$nQ6XNu%-g~RYzLDW^g#A_2v`DnDNGUywafuTQeK)bAxQ{RtHC`MpiNz9c^ z4_$G{Luw$c*9(^dV`!=3ui4rzA%=2_hr|?^U-jjr>>4)Ybrwuvb1u;$ul_mP2yE+X z=XElnGtVcW<$E4wHG2ZzC&pp+%=%hsLuJX)m!itCru^dmb2+*F(GDq`c$s%i`GF4| z4x%pdrgUud(-}J?Vc#8JI1%G~`LSNBbP~y=LEAg+5~KuQ_;GIsw_+P-<%d_Z@_6GL z06*U;pJpvoG4zm!L~`BPBj>eF_&C4+t4j)Q9<1IVV%Y48If2=1lg;IUJ3pzuhRxoJ zH+FMZ8m%;)@xiX$!{n4ZlAbrAJGGApLPnwWtR;R$JDx1RCrR)azMKJhz(jaY;&xa zk}b)MNutoqUP$wNlRg2Y~KN7rBERp7X79KURazM;=_gvQz8 zFOAd zM~vqpWDKf(34DlgLeYXH$+G3ObA!~$FpiSI`r>%i2qIjg;<%;i!ZH~fW&;ekYPaF3 z#O#VmwZ|9*TF#zI96U=D-{iXVJKZ*pypP06k#jm(9hr7Kz9qN8?}n+$C=~K+nmQeK%qf zoUza!MONnr_kvY^Mb8P6z;W>CF;bS>!$fB+R7o?xyKPC@X}Z9;Tc^nd){W3{vzo|B z!X^6?!C}w!42!g6j_*s)Se7Do$)QJHUF=H9Y`iqYB75^x(1x$Ka&jdt$-F+$S$m1t zg|T*SZq(TU7}zFao76)Tf`yLR3e_AJ%BEp=pt>#h>L7Gp6NmH>mcV8(eQE8>o(Nq4 zqrY8NjB$F&&5Q=TiTXqgHT>|Y6DCNSKV4@P4<++zL##uwqXoICU%g4H`U`g6$=-$@iNZFLl{tC$&U<0;;>#2_qK;J@7o(}fXfbKMV# zxS#~C@shQ+h%FhGXJD)PK<{lg1#2Rub~qV!BvfAz_cbObMSw0!uE ze_^A=6d0{|B07ctI>Af z%RcG^Re>hoOE=hF56H*2(Yn~+moRr|j#j?sY}xh#enH*K#~>44HU2;$7^m2JJpu2y zI*h!0@E4tthw~lA{RAy1s+8!CfJ_^9r(kDC)e@z~27pFRa&%e_I7N`%8QxcFc-EeM zkDlC1KJ_iURD(W$FNs&mVSVl5%GB|bzh8(cW3nA2F;Wm)2HohQi;PhhhTo%0j087i zKpeD%5eR4D%mP#Z&=w|~!NI?7g0VV?4p?YwqD)F`%lgS9-P|B9sn9!b?0$Tpacyr< zKowhvW56gFdpm7$zy|VAeyG_}B|W+28}GnJxCq;DGF@?b?Embe9tVVkO`&ORBVy$k zZ3AS=M|AtX~lsdge%u&(}ZC6)iK>c+5(h|}J<^d6$*4VdUT$8{zQU7_TURntkE zt$~XZVI&>@j{hoXbuG=|F8u5g8XsXvTFnE`Bv@-Pd}{P~&xO}+(o6Z%_NKIj~X*2cYi0eoEljql=#}3AF0f$LL8A4$o{Z9V>fLUri4wlq+ 
zsr=4pj%mx$V|g&}S%8z)tFy#!Hs?scmrktgPd%xTUdLlK`pC5%J*yS0pM;F1VRGS- z5&ql=K@Vsw!}oiTcb|7^eV6S_wegrd>@wObotD3RQRuR5Ak+c=b8TxRFKiZf7!1jF zBv-~fpTWB-C@BR)5`M8=Wq;Y~SJir`_-c&jiMgE7DAqr=4_0Zd>*lDsH#wi}hK)_b zt(;7d7OU&?5?Wv*R>Y|x1{{YdiE35n1cNH_}Xq;+hyT!TubeB zW!@zglg5qjXl&Z()1W{}98u3m)%8tZS?FTQq|%AdY8&77-)*L6h|N@Q5${t8a8fFj zp2gsMd{0?GSNMu`pYF{I)hrxuj`^PKL^m$_!ZND-v_){pJR{M?cg~xS(Gw{B&R#5X ziui%Sd#FXfBjtRkP5*>9`TA_HEzIx+a7eyW7l1M`ohZ|Z-Xt_@ASAS`hD4Z_KCybT{m5Wgu8hnk{P2QsX^r8I$w_qcqBfwBKt8un;7;dLGN)a4E*6IdwKbP zEGLHIp)`l~%~U;oU#P9Tx?D77R46e~1a_2b;naSTN*dqmy*yeE&w*CYBT2}yVI5uR z4WhQ-{iJa5-sK&)`&XAP%p}e-!JMZvZY5e0)+3BhAR$cfNz0Ldw-u(zZXp6%G($IY z-q5z3Bc9_Tvza<8^HND6K{W~AvJ<7*aCK}FwmZ|e7HNI=OwP}iqovm89lLPatpS0L zGvNu+0bj@{2+yH7C%&GDtO>?8gyS2%BUCc*^EjjPhPV>AWcN!zuC&Kb|989j-`>kk z0QFj?0*bUj)7>A{T{t=$bp{%AXI&7Ex>v*Z0bdKfZFsr72U1o$B6rV;r#k0bcPWK` zh!Cf{hh5Ll#BGN2FBRWWqr)tww^BFBAGn#X%HBLgpAV$)R*k%Z7M%L9zQh+SN0Ew zi6&QWN1rrUQqy=Cbg7d)8w+?FC0>9Wmm?MS#WjHgX94iYQzWgSFpBRN8lCqXga8vs z)*Nay<}K!X`qS4{zp7%_XXy?7%WrA?{1BdvLR!P=IK@xYDeez7GCZIa=rQAuw+tsN zMqti+a~}(*HS!d#C@5$KiQz(+sl{Q9(ywcd9VU}Jw9G~KKcRBMqi%KSdWqq;}?mJfSM|Y*8V(z!?BQKic?i55+ZZV1}!<23*bO^x6P8i z{#ijG;LS*H%pl`>LslWXyhu3y9+Z&5X z?a5@)u@HUYWag1+Ua>$|Tr|fhz;ij)3!-d|#gayqC(HC~)%{7vbqZlk$ z_z@vl?RsGuyD;*(sl_ z)=o|Dce!j6iZ_=ppauR_g7rnPj^T?&3SK(9n#GdbhS;OONzVJ|fb0G^uHQ30iQseK zBNNwWTZZUzPKbhSNfqr~Po>=bP*`dwWAkzEVsXjLj(}jBzC>QHrHND*O+&lk%=f7I z$;$@o2`0BO75*NcZc{n8n&F&ZWj~MoPYvKN42m#oWu&xEb;mWOp2)Di*~5n{dR)5J z(Xx_#kI<2Wm1&HR8!ou)Z19ua+TT-iGelK*Gn4ZxRC>6Imf#&Zf8{5>jmurune~Gp zf*IHaRq8Fb zH!zGeXc)W=l24)Z)-zA+TG0279WF=YsBlhV(H}aNS@GRB0z84pS%_BFiY&<%I#nXZT6EJ)!?(Av%F4fgN;Ibvw$`kZ2%O! z3-Z1h5d2SKg@2TB0&R&H@^D=I9OJ!Cwl*cxN0+Mh_UuyF45BYit05m6DsUzc-n$>i2deA0=GC`|NVoyBcqm-a`NxZ}ln#A*VNHW%{wb#*p{2K!^cQyD-3#sKn>BMJU%zaV6Ezx4 z8PzymR2ABN)Jx^xdRgHnYe8U<&w2!(G(Fc&FR}P`Sq@FT7TNR<>rZB@!2x?F zzc}g}vPmXeC)xUY7UQL#Gxsuk-TMnn(EjG`bONXolsd6cWJtcpmb<2$|98{;A~p&qDFP zG;90Gfb8gC(Ur>XLn#o!fP9y@{q=Bl;N94V7hNm;%vEn~3da&R30?I-1@tk<9HaI5 zXYb;1$g4%;bt^>M)adkxJueB`#t&io5tjFNAT%FbD{g#!p+ME-;&`(Lu+WlXQ(&XB z0+9ym@%g4GA|c=c7~LfeBzFVxYNCuq>CO=e?8XaiCxF*q1?gIR%U<(=BK5PfNx^^r-=r$#EhSn>uPL7f{Uqa0po8R z?Y{9TD!=u2?DjPf#hoOh*7wxU=w`obJ_C*v>+_m!e%|bB^~H}a z79+7W&Lu$MRj`m-fw58qq-9q-?wMr2T_4#dz`>(kIuat^dU(tARm$^P^Eaygjjo%T z)2*Jh+TflMwbf}5XjVaihWo~C9x@ePTkB7P9J~Xi)e9GI$KKEAhw%rj%l)>f91g}o zRPb38=<{rlQP4f-;ljcw0yLl8ndqC}$o_tvywU~RE41??1q#Ni4y6BIFZ_ulq*SXY zYC{YQ!KnnugW7`69Yk$=QNBH1l$Z}sJo^?}ZoAeDz_4oxCS^^$f5W}d(E_m|ThmVz zNp-rh+jB&~d1zCUa{LrZDWH(4#p^Q-{OPWUnx=;#xq6_~nMeDt&GKJgd9r|W9sUTA z`={v@UxP)iIr2{U)^Be^oCZiWi0(y={gv_u+7bW(nVpEF_h0Y)_lt&$Q5=scer68) zuaD*Pzy*1hoUMLKYX+7hB$X$I(x`vIjldP4N~BwtX8rwfqm2KMb`~6QdNP5pQuF>( z+ev?raqRRV5DA5jMg6<2Z0pdy6mQCU}U@ixy zG=tPd@BU*h@A(!16^qHbqs{N{>>dFITF%+8^6rZua$0aQ3iLQceGH}zh>vZwQV>vFew|(Z7?`vGz`)eo8hB5jFl*`dFM8n^&);do)`hJc~ffF zRDpM(MY(my8C>}B|Gn`4O_`V_&hIy;8z8Owqx|Gch?aY!lR@tI z6^QyCk@MT%$&=uGkWCgyh23p;+lbB<@jgA?9mC1Mf|iz5#sLsD>CR#( z!H|(u1tc*w9tSVa;jqO5gOASbmo+w_q5%O{SU3V-+Wm6h%D_z%4Zh^)65qJb>@$!Vqr3aec(~)}eF&R6Em^ zC;H9r-9j7~T|yI=@%fBRRU*JCbMDIsr7qAGFVB)Vn1b1KKKm6m@Z}wOK~u1Lo?LX2 z5_{XYyUibN&{-QS*eJIL-(;zyZ=I>GYFHc<{~lW@03DoSD$@}W3A+&suk|>_e3Qj+ z=1YGh9E!I;UdY)+kc7-M*rj1Aun<;ETDfC{5rD0P~es?eu*SszNAuC*BW1MO3KmycKI>$pvuC4Uz{=sa=A`9e;0 z0KPe1&^taWt7>Oj9Y{_lX4QMYUAzhepi=A86hutgF+jnlpSuNghX^6}%d#343FJ%I zYKER>nilA!7%TlF?r;AQ_o?@8aDNK*%>LI%{U7%GWDJP(>zcmidO-Ob0pgFqdJ5S8 z5wOmk-RMRsDG@c{gAi9BPziCgZKDu!>jP_@9rgCz1i*04&bONZA+}7g4v_Jdw)g`q z=?t_6EwcyDTnU*RK3VlAFpgFl3ULc2!XDrlS&<`HNvM5oC33MGrDt~?jqCm}1z9K) z5YKs^0j}=&e1q4%=@ zM0Cm0@jSI}+2xxC?2Xx6lT%Ji6fBc$ 
z^1h&Jv2v!c0HO70c6?}@t9CJsRA_iqcK&K%ntdEHD*Af}s5>Wae(1Uz4+LI13!du~ zwf=+eIh_xBNLnB~9OL;41X@#je^n<=_q0b%CaeQ4A2U6;A*=%_fzr~>_0Aa~zo zbDu6!quLxTh{TjHvG}l24bsrB$hRL*0~RaI2Q2PFDqfDlKz^Kp9`ZnD57ve;X5drW z+6{odJ%a6%d+A1~0rGE6mwk*00>zT32w)0VL86XP@7NiHrU}ya@7LOH8wvInnVyB- z5uw=#t+~5?(Ms(vn)#AEwoQ5Yg~MV0>oFY8X}I7?hP&(Id8!8zTa)?@iGJ{l6s-l1 zWgM%8){i2F3x1b&0=so%gQp!ga0l+VL=oLKFUZK|r7dPdHPC;Wr;5~%Se+wU89=Fm zpwL0Tr9KLZMvN3r{rU}dvumojB)q)o=#e`ZYFzUrov(34A2Ad2|8!VqNl6nY2TaQ*!EHXLsBn^SGtf6#c z+}|Ptk~Do6z16iSv?z()c#n)7__Mc21;9=x4AZn=MR;EXB>@ys2jZF~$?5MCrIkkD z_*r&uf`ZgRS3$hU>5n{l)+|1|)N~*c-s5#!^mRW!U(cG40j04yTF`Ui9K9W(N~_c2 zX6J4UY|^;kHJ~iFp1D#!n98dZCA9aX7J+GMK(9b^SUtsskKkdvYQ{RNTQ^;cT2U(b z6UXd%9;LQS^kSazRUErb2^>Z%-LLvV67&?TigNF2=sXNy9iSg&6jbxHWQs7aHu9ne zi=XuZXDssVEmi?ChtNr;$UZRF03b^2_^5pLB}N#J0rdEz00tB^| zDi|?_!#_t&EBkhTzDuYgof4>Gcs=$_5D>^`YJ|!G^nj?lXQ8LJ%0Tma%d|_D=2-#5 z7Y7_o(`DT|;GLWsA*+axWj#^)704!7AzR}I6xiq`$JwFD=EIr0W$P5@Or^M!RmSW< zcz&8eP@mHJ2m-Tm8bTaj@x==h05^tTcYmG&u&53~z2rlt--;kH zR&o#%zIR+46UFBaQXuo@k4u)#=(!*Wm=@-G121FtIRvwPbFCG|pbW%0Z9qg! zm;2Afo9jkIk4_HpR=6Y?7E@G>+pZp>vb6#>Bp%M5<%rcH0y=;mm;L!%4vAtAF|}o* zDR*0_7zcmTQhBz~AsK}5S-aZF1N4ksS<~jF z2m&qrjMaaeJF7tegJ8UVm_Q1K1<`8T(Pu%KEI^bn%@@YY)iqTE4M96Q#tkrD2|D;GeBmxrJ17 z&5$y{NR5m$TLUSNGpOkG(`WfCwN%;Pp%5GoX_mbGLt_aMJ6f?V!QDT?`ze%&6yEhW zqe4&`11-J)S2&Nzki6CdZE+6$n?+faGgS3mK-ihb0hW;-4`IRBAgD>d1K5+lwhXlD zIJ6#w@;EHad6SPY?(r0BSLpFB`ptWQ7EcvN%k>O*zFyI*eW`O0XlX$b#YIyAnoClT za6thEI!B(a_lDNvllJmG@>ra%+p^U9j|$5%n}k1T6-y`5-BGX`=Utn2xL7 zXW*~7o6>e!@Idz`T820Q;GM()yh1y4#QQBqpjJZ=9-h57plxk5N9y(n$Gb*L6FE7^ z3}{PkYmwUi3;gdijx8W$;W)ycYPxlnu@C&te&xb!=bP&{>RcD%2FE?buJ(g5pKOl% z3bM|bMvFWKE(GhHb-~zV+x}bT8N*E28u2Xjp9Vt=CCo2mD*b5daRu%Jl1d z@*i@wF0HZ$lK#v@O!>{aLUBM@fyiB7$5rri?U3NWJPA?AXv|Xy8Wz}!m=2%A0?X|O zjsZe#S$p{G>l{Xc3Wv zptoHe4zy45*-R<}ImB}WKksu1k&-iL2CXg9!RmUTfkH$8AgXW8svjFcO7b28y|F}+ zM^*I7GQm*XR!M&{GB;o(SCr}v^6%DUJi9Rk`iw^)md1esnY?}T=3|0CBE8RvYaS3B zI(MRx^Z_Bib*5CXRixISNR^ z2BAB`o$2f7j8_=7dDS@nl63Fj9|TQb4k@QI!Jvi0HKe>hk$JXjPqXNLxlvnlOU7V7 zYvbHYVcU5`6UWe#LygQ9!IDB1cn6O`)9Tr& zCqIzC&#FAGzGgk`AIY8IOe+`SG0$JbXkuG{a{PF1^jzFs6?bLgMXP91z7dPavO&2 zkL#J-wEUYu%cjwKG-&cm7pcXu(JKa`o4g3e#Ofq zGQ!dC@%%CQw;!uz+HL`kc}B+{_>ttj`4&XajL$FXwgw#5b5#ZtDLLy=ym9JxR6ohj` z%FqYwry>ATnoGG{e*>lvRMbe+Bwk!-O8fa5`H`9&>1b>M4-#Z0R<0kDKK&;)s*FgX z(K5(T`SS=i2^|q>zyr__WIq8${U+2Gv5CVd+;rz`fO<iu$)2vQ|AY;`Pd7dJm1!rvIr#p;V zMXHIl!=n0+mEThXI>B#dwi74fvG}KKOTYAYNGO{mAO`xG?)}~dG2VcYaIP9Y%Fy<2w^NGf?MYjCTV6U;`9zpnn!{elr{5s)*J0#=gHp)(>RpnjL<=u+A3O z&4-AlWzc}veCa9^ECXnWBt(`92f0_dB|~X`L}(^hHY+W@fYw5^4(I-=1VY>I?r-1i z5P}_^0hcVp6aUE$qNV@y523$Z3jqN@=MKu2<{6(r3}Yt?7_~aIEdkyY;l)6f($NQ& z#;(29TvIisR0il`-fNH3Lu4u|pb6o4{yKuWGUoqb@4e%>?%V(2h$1NxMUhB0Wt5p2 zvUfI>O}5OBS&|f)*^yZ`nOSMrJF->wEHi}rcsb9m>%Okv`Q>r{?!WHG@BZ%`#pm-L z$Llzr>wxMc4fPAN*;^+M7ofM7z3>PY&i08B{=aCvJ@14D;Q8u4M(RgqzvPJ-x5S!B zsCFoYcV}wS=4r?}l)YtAdnNURf*U#2jicm_UjnHlGPkP&It5Q7bz0m#tt;F?MHf(; zu-o}4euMY??|x{nOHYQTrg!p*H}I$x^HG)PPAG@*`!-y)M^Dn>r2UT*^*{eL*b|bo967b&#aiXEn(ioyXJ5$I>@__ey_}nMX zDD8b*QSacmi==~f$bb9g?zi(_20G^xia1$%GRi-7=N9~ZX+`R3{*-3utK{*E0svI_ z&rknf|EkD=ZACQgL%>FjqsD=$)&(3(2Gm$aMl@4$0yTM=gvfU|wS{47^vEseI5yNF~MWTRnEz+6nC70oc zJ*pmb8osM_f*tRJ6Ne06fH8IOCwRmbOO{;zb$9;pt9qBfCH>Z61OIbT4UqB5Re{zg z!t;1QZp2k4rR@EEq7b>kYcYTS05&fTycv@4i)sGUr?D zgs(46#0Wy;@%*gI=rfT2;ObldUhv!rMYt{!h7WId3Qq9rRo$k(JhW5#6(gU5eT5rtji6zex^Feb)F9M^PV zTO}|Vb<8<$zs)azi*tFhy>@UlyDL*u1+{St*=-vju}Oxn1b%?isC^atw1jzBNefPf zXTWRnhTI02Iuzmhyc4msfR%;;&rz3=7S9wUu;@PkQsjBwjOD!eLU7K?e14@iX6?Bd z6YKo#5dumx)U%@9DpZH6f`qoeymfM21oey&oK1E)BsvH;hj5we%NU^5K8-ohu4cn;}5vyVT}Ljjs_jIaZYA 
zw|x~|DG3FaXL{K4;}0cpy-n(I2Enxv^k}Omj9n0O2DlDlN0VM21#HQU!){bcrRV0) z)+b;xX}zDqHQ>;tBw4BkBlO|+BGa~yXSxv&fH*{XzNrhv*-l1)JYO8;xb+SP{kZEh zD%&U8h^v&WJc9b_6x zN(=)EcCfrLlU*}k91;Px6%cI74}Q~oCIFdTKgd)Ak}{1_02!LC2*u(YyEQMCe9KC^ zm7yTC)kQfanWkUh9d00aR{`!1LNxe%!u{?b!B}6mGiqQva&L)|9O?vfmh1I(83XVm z8h5c-6Uj@iv_C6>hHCldbrZSa!-OFSAg# zI0x1FT!ZCp$yjdcI))?EY%r_StQqF$!ERDQ4PeuHi`mV^#5kbeEhndxPP_8>=5iF2 z0TYP>%BLFivH9#&p#L0r7P2e_ICvADED)pXuWuX7fl@tqyg8=a?K|YRhypNf2wkg{ zu5SC(io+P1!+~hUDM|t+sBPt{>53v;!NpJ->R$0F ze?f!uq%r6pf-+U2@1>r&)Mj?baawe9?UkST6zi(B@>&W7AItR0W7ofkA zQIwTI@XGZxc{C2qLdjA3ym&7`OK7D_rE=mXr=y%IOp+I_4%ql+fQft>c-FMHP&XGG}1092T$p6g!#Y683;}4}Vc;de&AT=|PuN8h2P&+Yf^G`-*Ai<$|4?T}$7 zEX9$uJfem-o7)+dLW}ptYLJgaqFpwa*N|&(Z|HB!}cz9af&*S~`T3o}nht@4MfJxRU1->acr zjlcZ-+kK_m8##p$@D;C(Yxr%WlpaRGMUbl84gw|kWGoyp)DCwm5=aUcKW}{x5~7*m zRq0xv=2c04FwNWfW_|j7zs)VrF1j<4%3J7d|O>0~=nSS#Hz?(ZQ zlj4cr`?gQ%IiGi6Ew9Cw|L{%sX3heR*ln;B7kqB!(A!EXtabABe*(X+EXb< zziSEe1dWgR($u{Rs_NX)ANM@gfUz_7eoIc=z|R^^a?j=U&Z*K_1{wL_`nabkSq8#m+*K+-Gsa>2 zO&s-My=?RMocm8&WBwM3JUj;2<5mX*AGK{K&9<45IhI|ww)Ig=S_mngTz|L9P}{;GLa?0|O@-i6J$23d;|f_={N-t)nyz$9NIC3*h{ z8Rb_~P}tQBKInUT>i%?3PQqEI0o!k~r5E{V0y!VVFKi$?+smylO2M-=koD%k#l+8{ zk~HFXb@_IGN^8bTGBbR?=MKXt_75b2-D)aanJ+gjc>yYY@?h}k5k516(EB$`8Mfu>d|F@SLv!niV8ks4lTYdO^AW~|vU)F)TImUxiQ zL3)3&Qm94MbrJBdUw!IYSD-qB(W|G-+9$XIgfBORd_43r2a) z79KGq;{fCsGuq`o1>RU-S(LMNc#Cg5@-e8PR<*%ljq8w?bXz2gyC_@BuLZJ@iy-+} zriw^cy6?bL0rINy6^B9wO74b;q@r3q`Y0nPMTkL9e8ndn{#0EpmW~OJbYWv7+k6X% zV?I0c53*oRhpHGQ=vLG(i%Ij@`cBpKHQeyvp}upVW#m?%aJTji%ITywKi8M!DFm-w zQOvW(Bt6fOYHzc<`i3uiLCQx?jNX`MA8knf;6(q3L_jSPM{f+Dd9#uvB`MAEr}8IX z1;2ah3{MR4GMrih{f_iCP4wM z9?>)cYai%`RNPa7&q~u7ep8PTGCbeLIt{tZvvTLYyhW%@0S1=@%xcu&QP#&4!bq73 z?M9yyvZa2+!Mx>v8Se;k3*}-fU6ri#Ie&_%1~*S#eP|@{A3dQG)MBex?ijh{0>Lc1 zZumJggS$g{gP^{`wv+dU8lZ^|0aZ;)-~r<`NqBerL@<%i{}YO;W$S3Zg1T+BT_lSVDVcn@_D@v;t8^37vlTMivTAecw1}(z2`6{UQ67{n6DED^ZA92XOB*h z#vOYNz{(;hpJNz*Sz;)?C2HA$pBBbaowh zH=}8ImNIY>5oFTtrIBljCta)fvjOEA0w&`&%D?(2Gcip<(8>02VhABqhHa8T>@{H< z@pMG%B|fC0a4pgqer8!+eMVU3Wajd=;DsY{npOXs2ah$}`ITM}+7gAt0)QS1QziPk z4D-}o2=IXNm!Ct9U=K*}t%s55I~>4VDmAPcxx%3I8GD^XyJI;}(mpH06@F9cpcEVd z$8fzuD2b6KdcM&0`dvvT5r#KPFAZ*RJ$cdH^CQZy?QO~n8(G^!i3ryB(HL@@DLYgS zpgqyRT>lUxr9}C#N4?Osp{O;K;9eYLkwm3*a##bz<%4Nj&?=5j+G~lV)xVvCIy0Cq z{KAd6ReHBg5tO@yTqLrnO^M)+8Q-r>QAgeGl&S?EwsNkiGfDtHHqU0`2Gs&FQ0vnt zortC(a5osVm+SM-n*0S_-vJ!OPC_paLsAjyL}trBlAFYy=hIDyfCNAVB#0ljU66q5 zP($RWR}RfB2L#tH+`Ry$#8B-xPi3V4wD44XupgB`}KDi`$-_vTKK4u1C!lj1}M?L};C8@SF z$Gkf`=3GV!eRX_vJin6YJiSl-?O+MMD#GCSd#!4R6*Riy{)I1;bbP`W%4XQw&z*BD zOhse1pw55@19H=8IJ{4S>c?8rKiK^v^pNYf2dWUcpe+*JnmQ{4-rC>+M&Htg~k^8R>IBxTzlz65cZ7kx2_*iXf(y zr~^?n$aS@Yj))@tjDy4Lbmdr)bi^zpNsv<^JGkHVeZR7}HL-23EwF=_C=$dD>ZoRv z^V?0$5fH8dPJb<5FMC0j&ft>6q+%(|tGl+_9y6iw3xoxIXQjttRCb5Pbdy`> zU}$`chQ<(UR@Q%HApU)FQFCIuklxn<92L4>@$uG#h0Z6@+08d=dfOC(4xOcP871b# zZh~LKPGOmA1np_1LD(RQb7%xm)#PfZp1C&kVGD{|pyWldz+#(@k}PB-7&5|ZN2vsO zIHdlJMsC|sy)Ci1=8v@oTwP8oS0NOg;-x6-_z})pLmUE8jMM;9_j|=^_;ru)RGuu?Z^;Ko~t6`4NXy>QJ7`1(Jx>WNKjOJ_z^2v!f^sN#u zPzffJHig@2Km(wo3o;Qbwvrzfbw^vDz@XDe5jts+mKvOs6yxmM`>Zjd1uH zK*o}L48M~VgU&A?BI=(>QknTXM9UK6d-=V3cEU@m8%= zp&QoH`cIDJe|f=${cvJ`1TwY+&0pTS+4q<;>t+*v;rp|C!$KzFvClbAD2{$dQJt4f z%9vG#@n5|MT=Jo!AidYO^7x8M&d(a@R{8v~x#FYTaWBM2>6XctD>vMs@p+tyL>bRX zOzm!AbAa1LVbcHRRmfWY|uZ!|e0I76Zin=o+_eW@2ZxnV$ z1;|8^l^)uRPx3uQy*lBKtXFaxrf=`c(JJy^C*^$W(14Is&Gu3?8k;xdTL? 
zbH-#^a@edT-69X^_rGrx7u$_V58IX}dY)RPY)h%!EIPTeab0k@$QR-e41&NX2SJn_}Uyzmf^C#QjqmMy0HNDFp#&iuylnh z8SWt8xd(?H#ZKi8@YPsM7afX@CHDJv$khGRQOiJx`dsXz$ZVTAM#05!4~F6AMKcv# zrip7z=>mXGLkwv9OHs8`EE)#cp1Hu@!54LuC}N+N;A^ThhfUFNGbl zk0rKi5+Xd(YaiEM=6?R!H>-zt6_^?!Pu%a_b%@suK%%TX(72XHA_exqL~iR#8A3^y zMgUDux&gZ`_o6|&wLJS8oLoQ3KvVX>#iZtl) zyIk>e7BHuPjt|N8V7tix;P4Jv)>QC$v8H0tS~+tgNYcFa!RTjv&Pp6{vAbi%r^l{a z?_w7$jv@;18}@^BAcJ3;?a95uV{S%P_d4Tetc`aL5AC-)Xw+EbFMTK$D)|~{cR~Ju z6lllDq8FvpK52uX@kdzQU3wf^fV34x^L*}ALfJERErny@l&Tz`!z49&wFk`FX6yOX zx^F$CH15mlT%B6R4B7pSJJO?obJlEm&<9qNzp1h~%t~Tdf z?AmTJ?M8h$`%oFli_2T&3(jx1TGCBm^1exa)k^0rlW|p1C%(iZk{`^u@q+2CoInN? ztV7=%opE9=ucL@%HcVxD31qQ7zcJ2oUGizHnUEIlRBSy+jwOAJca*3=VVId7kG{t2 z^kX|rg^g0{8uxQTs2WP%?>y3^3)=_I{ujOWU;paOMKJ0})X-;ssg68Rii}N%0mV?8 z{Hpsqrdeph&}=?AY7Ugo96 ziP~tXlSuN;mOW{b%n-%zlx>>+8jfo_cs;j;R_6tz1E%M9enqN1%f-bGI9*HQ=Z*Eu zUtG!d>uF*!am`T?GE#hrC(L-KB}5c$aY=DCaeXc^VbeTHBD(TKfaa1Hiw}c8%OP^% zmzG(iv15n?w+k*5;S5J|VHGufE$?1B(Icub;s2wQxv$;&$ zgqN2Zmn zN;lMXm77Y?Rn@@^vWv!dDWA3%B9&(RgmQ7!NZ48C@b$wfeHIA7*Xt^fO38lk$nyl2 z@B0{Kpmr{(d#$P77R>mUFySAY0BTNT>iNj?agaclEEfXFLlP3$F&oRzz-rNcsI z$&q*oZpi8D8(zZ31V7>%W`CwQ?RCQsVE+W;Nj!{umRP@RE*1q=@#0v}bBDp`^kwS1jJ%s#VJQ$|Un263^8WD_2J%|kT zx4lQ!!VX<=guMI@`bWz4Ogc5pgP+xK`{2L-l;M52$bqLN`1Y1S5v9Q&x6^2L^-3m;1BQOeF5opXxP;D9@>*D;P(}tqoIHuMSoo%ae9!k^U+K1 zO@? z;6feg{`eM`&tUD~{fJ*|)o54V>(?Ie?t(HP9mlNz3aSKH(e$HUxLp<}Ujxbj+3{<- zdt+Vx3~=LV2v0!?(MBa?rmoW(=QnM&TZ7P@>oR+PW@YryICcwwMbjPRN!9cRuX7UD z-G;iWDe%i-??pS*N$?2zbLqenV~KWBWzi~B5&sA}u)w$zNXq)Jg=7DCxuxRZ?{;aL zG=-uCCA%AhVC@Q$jsVIYn6p^~8*em=ETamohxN>Rvgc+h+kU-=3nLXs`v&IqMI2`Z zy4Hy3y}#UV2v%sQx^t6;75VW}z#(rp*>-;EHvoYMc}fQJI9*5lw`y{tY1Kc|WfXWV_3vS=Jn7<9wBNFbQWo{So zG-f|4!TFo4N9CSlG!6{GmP&=J`%hqxzwil z0mwBIQJ;zc0b2=bWLH`_$A3>2P$)(u_JN?xs3II}?~=av za?q&TVgG!aCY}98fczickAEFKaL>Cy$SD1!z&aiYh}IkRw`po09*+df=t1EoGLph1 z;@Tnvy}2Fq5w0yMwKdQ$Cj&!uaW?U5&|L;H(qc0xXY|pulDH7K-2*66ST%v;$5D&7nbRw!NtLCz#z3}HDA!+CZaAFuE>gmcH{wm(De#T zNN9dH_B7+9FU3{LP<*BM+5-$=9Zi@bYH64Dj(pJzVUgeE@JJ z17l4k9CuyLM5U8eNJ!u6?N5o;m84@Q)OG-oHZe|yXDu{SBkx_og$p9-NfKTbc*$wP zEDIUSvshV3Nx8?3Z@Ynm=hi6@6hw(-o`&Y$zi`S*e)j~ z!FNXW$0|%_eYFR+ z!79+Gc2k!QAbz_t5=OkSlTzQ~GE~x}gQx@rSOZVqUp)EkfcFlGKb=8a-W^HAtdy!$ zxdDE4d?vk-tFI~#^fgkznb$@TT#E|a7_4EzP(kXo<5OZOPj8JOm8sz~TK_s1{}-@i z_rMgsjau!Gm%qNr)Gb#pvL0sRcbX3aA(|B6tz!UwM?goy2(XcAYiF*Qh={j4$ZC|! 
z9L&&qAsyGL6coO}w}wm_$ZU@kB=+)7RA65%1shRQDF}XK9vmTUvQ+wtO#+DsDAAw+ z(rpdly^VJrI1DEGP!RJyWKCO{IjS-*H`6)d&&FNkwq})~nh|2#7G>$N6f<1%m~Ru2 zi*S4SKdnC1O84IW3<|zu=dM|9&(5q)fXR*qq#y;y#jbI4fd`wf{U;hyxh%Synvxv> zLU|DOWcjF*{_(Rs)L=~J*y{X=WK@scJ?g}Go}KhIJ$UHwajIo^Yv|DLR+4b0U=|ly&2?{AYb7GPe%)A^GCteOcYl^@7$^B>QEFvl9ijQ2+Eo4+%;28 z!Xi*asdCL4S(~4oCx<#fsymN}DGtD`%1hQ&U&85B2ZZzjZ>xwoZW7^X7LD9$9Q?WN zsWow5->+A;T|4796X{;T5@y+-}{Z>Ecg4tQ{z0>t(6`zA9XM?UtKZ1MW_sx8Tmp%E9Q#SnoDt-GA8#sOr~TtfS7##cCmrk zwYL}boe(nDk~eR9Njj3gz}JYvkOCa`j4!|E!|`7*{7Q}4WvTgUbEcZ>?VfC1G{U)n zHn!PuQ)uI8TzI42#pY|)p5~{eF~Gf7r85=R33yD_r%oaV* zGB@x=d*8X!jOcrt4y$a>ooC&}{UC3rcIinQjvfqkKmW+cFSm(ZU_4bR*KRQ|_QR#alaTI2TXF;WIhg zvo9Fw7l4>4Q)WUJa!B$aV{J2k-5tPKqc6F);_cCqC$nf)Efx8olhygqSwa!WI(}E0 z1LLMV^$mr1&+YX^@TKzgI8H!TMHs+aZT9nD^6orM!H7gs6((3D5F8T)ccDT@iS3zK+@-2m!j7m6J!G?6XXa zS_Bm6%~ zV;x501N~5FV>h?K)E%uEG0?-E>CnKK#*>x=_`p&18j-LOSkv&1>pi^XZyjgUH!oSX zF@AZenxQQ3;wITV_w(2>6|*Rtn#-Gd*FxEe?p9ZCVY)QCz|D6u&$gzDAa12G9+$O4 zuKra5wdw`i7TS?>yN}fr@jgwj@QYt?B?7FU2U8p&AV`6SHonYZst`nzG4^E+gNV*4 z&2Fq-fdwG3ohgg+RXZN7D}cG_cx?6$N86)Z$yoaq-WEr~b7puqip38nI;f{ZRpgQR z81+zN2VrhHPJ@o)KY&{x-jdXD`Xa-Qv!-GqzX2BH?>LFD2$G57qn&jbmo7-GA#n*3 z*$*^Q=5`#`0b4f8ZRTn1pwxZWQlayrg)F5&qF37nmag0amsuYB8TvTKh6<#RP5R1R z6hiEO#h|U&TEF1z%OVcv0blLx0*@)D&LN+w;#HS5ll*CVyfJm-b6B27Q0+5WFx@W} z?=K*PmC;EqUO>}(ri|9`sEx8ZwdT+gWvA3PyVtlpSse5Dt}?oXt;| zJ|-IHHeZ_WvShp&vfjm))Eh#4AAs**RnWYQ{wxz1f`h`?*bfl^;wEqLi_QAEg_FIL zkVJfOX$cZn8NUWZ_eMHgp>mUfeH)QvP~BhZX088TO4wmR6!?S=h^geNn=tt=UQ{;) zq)gz6mKy!eJ9oEQ*fR!>kSlS>_6vWD_V!@_j~z>=Qe$FgYDBJ2_*l@R1JLe#!DBB4 zT=ZQF-MG%|^Ic1zDf{pt;%yAq*+g&rt-~#7I5})qCmH$14Nd~V<3+xTe6`8|xeD|w z3rF@7oRT=g_B@a}?3$Q~wJVSsf6W=wPJI-VkH^9E=!Jy;(2S0<*arC(7FJDGUqK4YB$G&h+ zA&Wf>I+f^aQyfpBV+#Btw6->7n8;Wjg6d%d**FQ-&hC?LecX(9pHn^$o$`Y$W=^Q3 zw1YHtUMvPo8#orJVSY%%VSk;{GfN#dZ!uDT?C4aU)x74T?VYF}J96r9gC2JpeIJPN zyIvRvCuiv8J0DBfc{eF&U__3y&*U#l9!5@0SUVaF70@-Cju~$S9)JGE#optA_aqo} z7hCnzgQ&x@CK6iVOF^Jq7-$iBcbA_fLkVulbCZm8ISZjKyrgW36Koa>^yqmlx|D$T z>n7^_Jl&)k_nx_l{860vtnrP~J8)#%0mM$#M0?`NB$K-dOIlz7#`n%To zu#*C%e(a7?4-0QWBMaLkmx7moQ+)=ifF^qnP>^?kL)@M%TqJ#jDB#Kq=u=TZtHLJN zY)T^gTB^)OoLvLs_(>6l;y|I*mxQO!3VG;&bZ7Fnm19u>8mQ+Cp^<7n5#(z7*KEI* zJ2pTciW=Htj!nE@D(Gtoxk?`tW**K)VQuX>HZWLU1JP%%IDS)S}ZN(~qV|M}8@ zK4>)BtHnS@AvocCJcRr{tjmlZa9?VUm(;VV!G+Ex!MUaPFbe#c9VqR*#+AV5+r{qj zlb|4>fw~KV%v0Rg7vqa27u`Y5xDrcXwS-g`h`CVlqYe}M8WX31eQ2Zn^Kgz;S=e`^ zTq#OGDK4m!v$)=Tmek#z1IjLGX47TtZ4&=6!Lh2Lc5r0=8*uoCfBFXgG|~j^cMIka z^q{%jtozd>%w;RJ==V|lPE_Z51&M( zCM8mj2{ctJs;P-*;&jKM-98>QcZ0xOcmky!!p=BR=}0!#ucFdg@?zXOOF}!epr|9# zEt4_H%77Fje7(=Nfg%t=fF#rKfFRUVv_t6&m~N!n5r8!W7%D6+Fg zqYzaB6`(8ifxfFRx0N|&LZc4`~q zGE5%6757z6%w}V&-NxWYY9sSB$K%yh@9X@TVg4I)t}812HUPHprVYsO^3nH-9PlnA z^Is7R?~Y5f-RI(8kgnd4bS=;<2*Ozo$}+^s^la;5)TG%)+ck)DAZ(+Lt1nuKQBjq* z90{*+GJ_vqs3LgIeS?V<+{|Uh{U#oWUeX2+R zU3H+M`Vnf%E6k=Bkg~lgUa-IOEOQC1dak5ak)>J01AF~qY&paNf;Pc!KChAxhQJ(u zw!5dS%0Ui=L}*tyMN4yXpux$8O*XA3?(y-5kOl_>iWW6C%DUEG7Euw1Ub51gN+BjSm4O<=8yjzBr9t!=g;dT7=;}#Tkx4?(?9dDK( z$3_mp36OB)*b+ZW4$u>4fyh+8@3Sl1YT89zN8a;y25eJFLKI?tlMz~A4n8bn#s(?D zcJdc3*ETKfH9g(*F)QH^dD*P;mS<(gIu)M&e5VCHg}C{Hm|=GaaBK-Qowt|;%e^c9 z`^|Xi9KAM*eXjZm6darv{~=pW3=g#W94oQV3>s@*;n#VIl>oz`;_7TKEu0V>Sm^*q zJ;bDP&pgxMJP}2g%k{n!5)9i9C_(sfZA4FKnNAE_>l>p+GK6kvrpg>7fApnLzq?5q z3Ug4XdfJvwzTqP{$*v}CSwQqbiI_&WEWdFt&)uA&C6GET;{?whHVu9s?Jb!}o+Eyc zN=p)|?rftiMvIfH9!hCoCm>a~1G|dggtPtpNp6Z!alp&lLt%OCCUmMK9TVXp(8f!1oCmkD9Lg`(U#0@sn&+$Cn3(;dk@h z1GeEl#QUY|wH7uk8N9kTjdwu|P67_q40L+10{_xeEBT$c?g-*vE;l*s4PaAC07!A! 
zM+sCO9Qr_XjR##86t7-DMAtQ+!gG7J+SKGQiH0Of$Kg?rpfvVMs%sQM!skFz)usV6&_^M~D?i$u zaq>lh=u$t*8Mil*;I9N1E70I1nEjeg0pYZMNA@`Jk8&WKs)DHZ@pwu3dC}vAhlbK_ zeA-O_p$>_RoVZoj+|0coH55uk_7_?Ct|G0F7Z(?o?h6wYWT)))RfOyDctp}6(uT-~ zDeCe_23(9(>>8|tL!oATiVzdQ0x=N^W(F!(Ay(mc%8NZWWC@cIz|Ug2^6x(-ECCN> zga55$lD$3y#9yIXLGrZ!Pm|~Efbs^cX!nb8S7wA~HoQ;Z-!blp%)9VSdG9e@xDN&W zZk|u_{`0Yf9pFJqvQIj>_r)?k!Xv`4d9H<{AZ8iDP>B5HsOjk?fr6SUeYBSR(Dxw7 zmTq|`4XL1uKfp(|UyA!Qyz41LvGkB*6uq0`zPPw3fI=d^j_m3OMhq7iA%>QRM_`}j z2k&u2eH4G!YCd{`dtBmxHy5!p%#Lz0K$18sqyh_j^#1@BcIg5nhC`;>!33b$#s80? z*$wTh4PXPEGMqrX5G;q`&;qW1_GI1on(Do$JQaJVi}*`GE#i_6gU&Pa#Q(|^2WtM` znc_1SlRWPBgT(Az(O6re7>CDiXBR*mMo|`;wjWg6U@GjNm1!L>R|>KBEZYaXN6|s` zl=)4(_z2_`AjEzO_5yYRIbGNBUnDV8KFcX$sr$kLSiaD>+$g^{CtfQK-B$>rU4Vu$ z1k5@0hHK{e3~&v7`urXU)o<_dVRiR2%b~Ztx6c4QMUU9NjuaV579?u`IkNK*$dS$W zl_T3BnORR{Uo9REHWo@g7@e4CgrAcFkzlu^w{}zxA3Gb2@+n|k83h1aIaM~w->B3} zI0@%A;pt510a4RPH4J%$QQQ_0B+d}T95*^X9&Le=prH4DKhPxpImi1Qz-y=0f5YtD z2%w#sAvqfqgnEMa8}Ur*WS!W1p7{_w&kjX!{iDEnDFfJ7ZU+(K&kMkU?XVgRgG&SK z^N`i_Q4|S2dr^)$!m9>;$r#wQLm~C|`6~3)Vt^-nH|zH&rUlVf&3Npg1CRsIQ$ z16ZzA#M~y#RctV@;kj4>WHk!L7(6%D`; zM~Dhc?rE{}yVQHIxHljad=|a1icQX9*{vUtiCorBI;xvlT1MGfngtSoh)@*i{HTX} z*Qhidf-WO#||6W%Xb>WH%0fh*& zmX?+(LsC1@Swb%uv?K=3Gq5gyJY!u46Om;@Pr8y=Lj)`H->m9Gpvb1f*mV1T|BF}k zA(#%)2jcf~+TW}xnv`WAu44>ELIY@zJN)i4Hr3zUsY9iORt>x$E zyfjofT2k8ov)zXBh!pyfoo(g&~e~+7M##t6=0E1DxvkW(jgBPqx2Ym%IP@ z>jms7W}-PaCXAbK+9=?>8cf zk?D@JzZ1+g%oLSvV35Js?QE^Z*g=f{m5JG%Nfsr9s`1XkOEGy=9!hU049xv131pL? zfQiCLn>?W4zn8h|OT~H$6J$qri~IQKD2lV0W)T}NHKjB?Z4N{yicTMqIZJDR1Kh4` zG}jHQ(lIjoAdeh}Lupd|f#jS`P8vB0nVGthNz*+kKY$q}Ta3w1c$sihAM?hv?OAW) z0MjgOa3yv-RP*A)g-8E~@BB9q=lveFQ*X^Ax_==}1iR|oXQ3Ok8xM<~L0 z!P@_!G@Bm`JZmgsS3wv=F5sM-*7SEVjkkBT0dT(oIJ8M#&)F}RCtOx>3FRMXPs{sW zqzLXPTYxDxxD46w+EW_75aH1SpCuewd0q11Fv z0Nlzbw0n(lmDmQ|mftHmp4ZfH4}|ZLtLNaJ>ClY6AlC zGnXd81*+Mgv|!O!n31Vj@G+ZaRK~g{Cmp7ek^2rmyy~Pe`R8CX{ zNTQ

  • P>}6@BkYe`{9?4N!IW?49%JwrAabfulQw6snn8e%9#JKGiH)9jcjm1?M+2 zZ(-qOYmmew$$T>tHeVR3IFF&lBNcszpMVEft!=>r{*zpeJQ+0-8KC73b~sPZ^3uTH zuzy-RccI*4Gpa8F7`;YV;@q487|#v3n6r)U&<#)^T1L7ywG*1$f55;0{og_^LN{Pm z7^Rta=pt8J;?9BcyZu)>Z|I(w1=95yi19i*&30w2?ib)4GJu>(BvB21oDoTHBDZj*G5*NuSl8INWP`f3?>;`*OD>?&GlL^) zsr(ltaG||L@ouxeaHC{AJSus+q9P%*#7aNlSJdpr;PGywO^2dlpdi%Iu8uXjRY+eX zM6=%47j1jrK<^XKrKo)q8wonJJlB5{5{o9xX8$mGyujp%)q1z?%G(x1J~Rc4s>Sp=()*RRjwPOSmO9lu(`F`3r45>M3J~?rGV$wjQ;pG z+Hnb(9a?_4EAUVO#=)HyVgs%;I%R=W4bDZY>_{fH29<31i}@z4R1*V#Hy5C!`SSwv ze!{{o5@x~j1;ee9ywH{dFZ67qaN@zYy6}sC9eSZUusPp(wE#zE0Z!bOz@|OaHk8~R zqZtKm_uKNPinqb>`<(o>c~|;*xNC3Djjf}wlw|!X592GZ!ILeu%fs@b1_seh;T~E0 zCVpIygje29hcmkyrT0g$=}u2Yvf-x|47)>CRuYhWcdNay5GPtsdQ<0oicDlgeJH&Q z+#SIWXdLm}UP>4P+9-m6t)yBrM8;PdqJw^h0XvapN-Yz~><0dKSsQb>eTy^tJAEqh z$GHSh^q;-nK^@lwH;c~HCoEcpb(jDQ&RIk|-1!*ztm^j|`?v?;m#MX$A#86i{3}yO z%_%y7=YN7P-`g{%K)42lz3EyQM0jhp@XfH;>%?yCET;+8qX9t>dhTtHS2uI!1ti7% z;W3Hh?Jk;XQUq&5@HTcHbOBY8h5w4w{O9`P{SCSRrpjCDUC4q<^1{EX_l#AE$PnPI zxsgbihfPKCyo21H<}O-gfDFbuR3MWA*)hFkJDh+GkiE;q*V&1p+n^&qKL&a%hP7eO zs>u_E6m|K&)bl>B`fy3cz|3Ty0Gf2@qaQ zr4IZ68EpF66_GK{R8q-oAxjjDg}oVR5ZJz0M;r9KI@IsXo4XW85sN(r2zi#uQ%ZP2 ztkH}(HAt(qx2MKS_$R#9q{Ek(&3{4rxVx|ijC*byuM?A@nNmaZpXl}Z8Q7-r?Duc? zf`JH(N{Lr!3rmX3XsJmp@4eaQ=rzA6A8R^nevRK@(jb2Awf6r_QD(K?{@`41B4A|Gf&c!ARN60pnhsN2e^hU2!+%#H7>sz$3S;% zT*OI8=C?WbCH%cs)gf|7K{xP-K^v|(TLn7r=g$sj>#`>R{Cm#(TV>ksPL+K&4SESL zBc!i53Az!(-2i@a)3QfiLCAOJ{KIt57+f#mzp7zG0I_g&4nW4g`RoeMKw=yCDg^Jy zDeOJ&B|%{)tstdCLA)r2yMG_*s$jhH0FS~*&$E|<;Qs~N z*DRb?d+$~xLKd6JECwLLG*NV`%%Ny2nvpwz&0nJuh~CpiL>f>EnRr^=qKi1#GC8P~0^-vA=x1Yq-;9BmFHSmzLt@pA-r!2M&_@Mog< zt7l~;rSVDO574%8+4__=F3peGvT{UoxC4JyGf(C0~k zrv(KCw+}z}3gIK@0wP|J8v{%_qxHiyth3^*L<-fywc?;@3I&PO4Zxq0AulK2diWZw zMS1R$O=ThV3P^LM;#%EH|KZ!R9pVMu(G~+tNY#gtXUn~fHG1#s=_Q;8_lNW`yY1DP zSp@*{eyB=;!jZVs`w%|}k!b*bLy~kez&d5uP8I)vDTNsjwpG=aHR;1!^G*O`fI}%0 z(z>(>tERgDM{xf6md19d{|2&UpcP5@!nmGS@kKvyP=sBl7DVQ(6v@|SxO+ZH2e9j5 z5kt7naJoegxfa)juR|a_<-Xm2?r(+tcxniQ^I7t*$_n@Sg9AzlWq) zx^d7l3-kgEW47?O6Dz(b==5dLL&vEsTg-SC01jT(VV|Z}T0!`wvBWEsE(#e%XgQxP zl?;N_4`U7Eyq$-XNPu$sUoO+emL7N(BVq@h+O3L!dj&~zi%N*JMcWBG%a~fd{N$$v(zJJ zkfW(`>LlUbj!NMd&!J`q;6G3AAG~$AT;BM8K3&p|4utREi(hrRn>r?l?;la0IE_$h81l9& z_D87^H+K7MZTBKl`N6~~UdsyzGY4m`jzjx(#PtL(<{e5*%5nqeiXFVu!4qm3%Ga4x z)2Y$m4H17*SF8q18(>{)Pm_UarMt@n6>EWE{Ry;?9xXEL$kgOS>nZI8JLHr4(pP5e z?!e+eZzL8(OE9K2Ab7L&CAT5iG|$*Rd{J0^#`^7fNFK;z)P5;JxDEs4C=5&SZgU6c z^kM29gAGHRS3tJ@$4?asFrG8^1z6X2<*xG5NS4D?2V}K2k96h(__eeJVd?DR3qlai zn{502W2)YkJdEAU$q8CF3X9Gk9~DH-9Roh@eWp+=%0hAffOFKT{LsUAj}0?&9vg$Y zF;MT;gO=6_*}DXL!0GTo%js`mBi>}H23V@y>~q8bfqPG&2YiFL1Y}ZRZZ-T6KxzjC zsS&t10i*2ldA@?#m_t53#o|RjVS6xlYtN-9x6ZK-AQ-U_KIY(gF0}&UpupC732Q#BEqMu4nsY(XJThz;qtyRao@rOo)NSZ&t9TR zjE}jiz6f2lX~RVwN$M>mEl=@OFg?G|ONxIoH$t2vhSU{*@|^ZhoXipFkq{X+d<`+f z?G~Y(-_;0O@FKl%M8liRUS5dc$Pt0#2yBdVn{~@d`6*!ZEkiq@6YOKbWw1bCI@o^I zex>t>ISyog57IO`%yc(*UK8|Bd!k;)Xn-3p8gKL<>nx{J|18ELBF&3iEX>T8bQOMf zmN4Xzi%sVlKjP?FE^~$SP`Vf2Z};5soGy*ycZ%M~B#(X1dNUpKg@1nHOLOD@DGUxuL$bAy{0swowmq7*GqZf?7C!f3@%_n{8ZyU*~>( z&*3OvC^UBe2~pFshSq2eX0~QyGH#}3;H-?dsgL4Mz?087P1X$cjI;v1d=QvELJdx} z2dtg2$ERA`x`NahYNG%aW%)6}osweb{?m*OM$Nwr1#^|nIy(yFo)))81gfF;!PV+3#BgLOdla2az7S_^eL1-NBLh1X(TAnzRyV6`O) zIiIvDK`CE0V+r1lO!6m9wzhr0;9@;`qneAM+cH?ofK8~ zx*vG{X_xU;Ns{+N0tW_OF{zMRK(Wytse@8n&zyl+Uf=2K_r1e;@Y$c#s6*Nv7}8Qp*g=7b5*F&BA~okNHEqqt~4{&oOtGd@8`^o+zp=mC%y@ zCUx zv7^)OQqWi90Y0UKOo!BF(+_cOFH^bKp}P|$)920`RXqEJx$%W~^M|7+Rv0IWS`s(Y z_~ifC&D6s7$)K;StqkFk(N{)mR>2vwMY$o7UV26RO%8N=ay%y{#zGhp`l7Z+u$(O$ zjBp&dsu;4Xgr}l|TbGR(7x`_!)HheOe}}7%cZd+;&`M2f3p6i>hi*T4k;H|bG-SO# 
zlC$EG1>fG~uplx%E&c@Q!>ZK!=%0*X;)*Y=vm5Mt?h}R8_@P&&)6>-k;Z=-ZZ<6V% z$?A&dEYp<_?cA-jr=9?Ig*$-Pd9Ly1DA%2v!}D;I+p8Y+*d(As;=UougDKBmCpuuD zUvE7MyJn)MZ6iX}efYn07zd*@+|$nv&CA2LYp+%8Gj5Od$vy|#C+F;-Ff4aI{PvV* zMj`HR^Y087Xm8S+BCBK8{+=Lg(cC$RXh^B<`;^*zlBykbXR~Lxc9t;rF|wOLI? ze8rw8<%HyU(bxk(Z1pYwkeU3V%uf1oh6cP zl2zP3J$?dNUh{}_gWK@($zJmt<2s`^6o0>cJA$PWAnIYBLio!9G9tQCzAk@i4O_i{ z88Pa`U!gRAUK@g(gm;$W-`Vd>7hbrZTiBU=@JmOh;=$|{Z+Eq>C$Af!;hev(k9@wH zEh98EG#~y0>F#Qu5OHF>SdwKH@g0`!c~#5o-N2WQA7;NbT;~s4_e&%1C)Xjl_-}m2 z=81*71*BguSnXJijL|;2)is|kgTq{a$9Xx);G8#7yd5#P=4LKP<&*x>^C2nFQrouW zz5_aO)?1&jEZG^J{n*q-v#(V2OXf?p_H!=dnsUL&0JSUJlzJ1Uoo|)mr(Rw0=~#l| z7`4q>d#t+n8Q2oXamwxCKDUlXEAzct22EPM=GTD>U?i+gjP&*>9<*mFOClFdw?Vrznc2jPB2K|Dxo6@P|W1zpL{C#FlD~ z>KnSC+EJP9_y`P2HV-(Wo#Xk94e^NMXtT(gJjvVKj=|X9&31p5Lo&Mr4o}~*WS=Ib zq`WT{X1^H|q z^n6^#$`==vF_)agvWJfzwjaGxHFI1TyQ-tx zq2rf^m$cjt2}xRainl;dVoYU%>PV*C^x0GJjH(>Rk(zyYel{LSc$0M`=?HIaj_pIs zHBV2^*Nwqm*=`lx^W>r^iQK%pf}P$u*)sXWElf^C-127ILRW2oyh_o?}Dua0j}!2-}O+%ItfeBSHh1Io$FPOs>%Wo`?0 zG9KBQ1-II3dc)vSAF3+4XVA5kng_2>gX9I-t8G6G)tA}K`2 z`P=mq7aqJm+mjXSo-lu{)6hY+UC1u8EG=mC#gs<^tTl2SvpOV<_w9aUAV`oK2TIP8 zWah|L#KY{09}?2%4dKT2ZMo`JOXb6+G;FfV#~uLeOeA(5k`+^G`HLI}UFa+xHCr9V zFLj^H*&lF2DW;e}lXK|tS#^SRK;SDHqb0_9Tf~5&!~2qJZS_XKG?281 zM*+@$xZ9VE<1CeBPO=C>6}(?=&VhwS?54Phn>!CbIwjkYd(2kw{bK+roF@9bX`EHX zQUh>GUEjGdkVpd%Z_%?MiT6O=2G)S}tFA+L^R4XshyGB{i0GfW$I|4k?i2`0H>oYB z+vBf=%-Uo3`iXN>f>OQ4$B&ky&9p6EiZ$EgE?CioTcXVAG@to3^{tig|7X z9oV#5*D|e{|64zzwx$zUf}G$zba1OwoGCj(ovH@6ij9}(EDG`eeBuArFZ>^S&c7bT zL;tboOb4Jv{>PqEsque|J;zbAZ_e*5fN2Kp{}qXtEexxdBj6eBje774ry5uDy{yKD z*#%4m*mYb@XhFghEGR?;Rf`r597&a+fzs)3ytzM*9^FW4KgC0(DM9N&DAYSEj7-f~8yRB2Z*NIpqv?AhY7ca2Q z^=|hwMcyHYknpnEen6T21mG03opCxBOjdg@9e~z-H2Ksm?WJTYSWZ0 zEGKu~V<(_i-?PB0DUJLZ#f#$#PRo9a5U*U(h)${n$s$&i?)n-}74$EntoKNlU#~}v^ z&9dp@Hdn5*AAWVRnTpDY92>Xvv8cHJ>6ZTMpUK_EW!&V)a zM#_M{G~`_FyMSOlJ&b|vA2FfE;E6g}G(SRd-%p#&vz_mw2$O5~>C&p=({$4Fdn|8A z{-6h|DBq;>o`04f_LsXmH^~&ngaH_e3So9?LrBe`92-+2s71~>*S%a4+?I;uKT=oh z2U%#n16n^1pc6iHe4d+YZ$r+}A%P3yA09gE7p@h%pc=641Rt6mNFiy6J>S!>(kkEP zEm-^xc7iq>#BY;&Wdwh0kxrwxVT)K5S{ zBPi}W);hgGw~LAN1{l0n^r=C>DyF~tofbjUemuL61de%+?1#@9RV4>o_BLkynF+%T z-G|UcrdP)S`VvD6b<~S)%VlmuD~UU%@mZwxENA>*ayuur5SnX*>AfiXiA5VZX2Qtv z3{_y+U=Kzr>@yI(0@7WDuWd!C6~!simt;nrT7caD*iZM&>+n@cgvJ1pE7TXWiL`PP z8j<*9CR$eI`F#8QHY9(aIkVFwC`2V%g>&Jkh9byRxP9+D22-O5(&{?yH)ux;DwAtS z%z9J!`nPQv!BuGGn)sI4NoyuD_ z+|xe-)7bXBalFCTCXXO$#;jwUazmAqb?JkRJ`&SLhWOX%)H8~MU>YPglic$k#iauK zyI~isct9v~a7J`Y+D^kC3~V?%TBiEWFv=OmmnprJgHHKAQ?v-0SlWQXtzvNNG}&#h zUG(tSBWa=xHn?+~_%?3r7WwS!kTdzCCrtl!eMcW^x;lB(MckEROo4Nk!xdpUMxx!Y zFznB^>0>#hC8LOQ)%uP;CX$~(K(5mf^kX;l52X2o2!CQF&&rqPuyLVf5AF&W6#Ec# zO9AnHO%a_xA}+MhG3k(=roqMRwED(_S6%a_^RtqUrmvjk%eRpHtni5bFID=FE(ZEm z)P|U4cWtf+3>Gl7m)^@#^IrEL3FIJ{+_&trEE1mMWL}kE=SqSoPMF75m*yJKR869GQ{a@Vf%yT?2NcgARw`?bc#PthbGHKm3|9Pf-k0X2 zy*O3pnSFV-wQ*Tf*5jofshhs<&DW%Qo*Vyk=N9|fAaFfs6gCVF)_;J@30cOS-J<|I zh;1Rx4671gffd!~4c{AqU>M1&Iso6AFIJ|IXi8>`)IwucqIyP;`Eu*_IH-A!UoJKz z*-jQVN7@MWEs*14CdV ztYPO%K3WyL!}_Tyn-@`b_Jqt)5zP>1v--?~aDwPJAzn-A`C_W@WwfAF6Q#;DW?AMp z5MA5)$ZEdowyw=8;l14>WSJ*{D1S#^;_i+RE!~y(*nzDmgARUBzlGA!c zT%;1PmMDW!ITLaBxewe0AN3oErkbcL;=VxYo;rHp*NXW$I*l}CC43LF*SNaTj1M{7 z*EBGQ>ny+O_JpW3h*)4e#gjfsJ;@;OXW!l=Cx(s6p}P;JFCi7iJx8dcwAL(V{CYz& zrJuWW(G{;*uy@A!n4u0z4jp15Cc~na!sE$BZR%iW4)--a(*y+T*m1I}At&Q0G!Ajf zh7RQ#5urrs-5bq!NeJPfImrq=2#kK`=Z<=_LwZDF;q>gwcbce)OQA35q{lQRN|`;% z9Rj%yN~fl!Sy!7-NAX;3om{j_H!@0c>!u5^Ig$8hK#ggioZ&|hi;@{{p6cIw_a~Zv zv;}>>zn59kHrCf;N-I5fPi-1BS;hDB3Nh=UPvTw@-?tkagj%S-ffnF@Y zKZ8XgGPfyyt)zNNxQrv)s)A$4YIBI{E^V)^yu2mFp`UXE!~1WjQb$SOYO6m+N6orD 
literal 0
HcmV?d00001

diff --git a/tensorrt_llm/tools/profiler/nsys_profile_tools/trtllm_engine_model.json b/tensorrt_llm/tools/profiler/nsys_profile_tools/trtllm_engine_model.json
new file mode 100644
index 0000000000..9287a6d9c6
--- /dev/null
+++ b/tensorrt_llm/tools/profiler/nsys_profile_tools/trtllm_engine_model.json
@@ -0,0 +1,62 @@
+{
+    "trtllm": {
+        "llama": {
+            "Fused_Moe_Kernel|gemm::|fused_moe|bmm_|GemmUniversal": "moe_gemm",
+            "gemm|nvjet_": "gemm",
+            "moe|Expert|Moe": "moe",
+            "CatArrayBatched": "prepare_next",
+            "ncclDevKernel|AllReduce": "nccl_and_custom_ar",
+            "RMSNormKernel": "norm",
+            "topk": "topk",
+            "act_and_mul_|Activation": "activation",
+            "Rotary": "rope",
+            "SoftMax": "softmax",
+            "flash|splitKreduce|kernel_mha|mmha|fmha": "attn",
+            "elementwise": "elementwise",
+            "Quantize|cvt_": "quantize",
+            "reduce_kernel": "reduce",
+            "triton": "triton_kernel",
+            "CUDA mem": "non-gpu-H_D_memops",
+            ".*": "misc"
+        },
+        "ds": {
+            "fp8_blockscale_gemm": "block_fp8_gemm",
+            "gemm::GroupProblemShape|Fused_Moe_Kernel|bmm_": "moe_gemm",
+            "gemm|matmul|nvjet|gemvx": "gemm",
+            "moe|buildExpertMaps|Moe|Expert|Moe": "moe",
+            "CatArrayBatched": "prepare_next",
+            "ncclDevKernel|cross_device_reduce|AllReduce": "nccl_and_custom_ar",
+            "Norm|_norm_": "norm",
+            "topk": "topk",
+            "act_and_mul_|Activation": "activation",
+            "Rope": "rope",
+            "elementwise": "elementwise",
+            "fmha|flash_fwd_kernel": "attn",
+            "Quantize|fp8_quant|quant_fp8|cvt_": "quantize",
+            "reduce": "reduce",
+            "SoftMax": "softmax",
+            "CUDA mem": "non-gpu-H_D_memops",
+            ".*": "misc"
+        },
+        "gpt-oss": {
+            "block_fp8|gemm_fp8_blockwise": "block_fp8_gemm",
+            "fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_|matmul_ogs_|_topk_forward|_combined_routing|_sum_bitmatrix_rows|_compute_writeback_idx": "moe_gemm",
+            "gemm|matmul|nvjet": "gemm",
+            "moe|sigmoid|expert|splitKreduce|Moe": "moe",
+            "CatArrayBatched": "prepare_next",
+            "ncclDevKernel|cross_device_reduce|AllReduce": "nccl_and_custom_ar",
+            "Norm|_norm_": "norm",
+            "sbtopk": "topk",
+            "act_and_mul_|Activation": "activation",
+            "Rope": "rope",
+            "elementwise": "elementwise",
+            "fp8_quant|quant_fp8|cvt_": "quantize",
+            "reduce": "reduce",
+            "SoftMax": "softmax",
+            "fmha|mha|flash_fwd_kernel": "attn",
+            "triton": "triton_kernel",
+            "CUDA mem": "non-gpu-H_D_memops",
+            ".*": "misc"
+        }
+    }
+}
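Note on the table above: trtllm_engine_model.json maps kernel-name regex patterns to reporting categories, one table per engine/model, and the trailing ".*" entry acts as a catch-all, so a consumer is expected to test the patterns in order and take the first match. The following is a minimal, illustrative sketch of that lookup and is not code from this patch; the function name classify_kernels and the demo kernel names are invented for the example.

import json
import re


def classify_kernels(kernel_names, model_json_path, engine="trtllm", model="llama"):
    """Bucket CUDA kernel names into categories using the pattern table above."""
    with open(model_json_path) as f:
        # JSON object order is preserved, so the ".*" fallback is tried last.
        patterns = json.load(f)[engine][model]

    categories = {}
    for name in kernel_names:
        for pattern, category in patterns.items():
            if re.search(pattern, name):
                categories[name] = category
                break  # first matching pattern wins
    return categories


if __name__ == "__main__":
    demo = ["ncclDevKernel_AllReduce_Sum", "RMSNormKernel", "unrecognized_kernel"]
    print(classify_kernels(demo, "trtllm_engine_model.json"))
    # expected buckets: nccl_and_custom_ar, norm, and misc (via the ".*" fallback)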