TensorRT-LLMs/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h
dongxuy04 4f0f17ac8a
feat: Misc Opt for large scale EP (#5374)
Signed-off-by: Dongxu Yang <78518666+dongxuy04@users.noreply.github.com>
2025-06-20 13:11:31 +08:00

93 lines
3.4 KiB
C++

/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
namespace tensorrt_llm
{
namespace kernels
{
// Per-layer signal word for MoE load balancing. All state is packed into the single
// 64-bit field stepAndOwner; the constants below are the values/masks of its bit fields.
struct MoeLoadBalanceSingleLayerSignal
{
// Values of the owner bit (bit 0): 0 means the GPU owns the layer, 1 means the CPU owns it.
static constexpr unsigned long long kGPU = 0ULL;
static constexpr unsigned long long kCPU = 1ULL;
// NOTE(review): same numeric value as kCPU; presumably the mask that selects the
// owner bit (bit 0) -- confirm against the code that uses it.
static constexpr unsigned long long kDevice = 1ULL;
// Mask for bit 1: skip statistics for the current step.
static constexpr unsigned long long kSkipStep = 1ULL << 1U;
// Mask for bit 63: step update is disabled.
static constexpr unsigned long long kDisabled = 1ULL << 63U;
// Layout of stepAndOwner:
// Bit 0: current owner of this layer (0: GPU, 1: CPU); updated by CPU and GPU alternately.
// Bit 1: whether to skip statistics for the current step. The CPU sets it at the start of an
// iteration, with or without ownership; since forward has not started yet, there is no conflict.
// Bits 2-62: the current step; updated by the CPU after one iteration, while the CPU holds ownership.
// Bit 63: whether step update is disabled (0: not disabled, 1: disabled); updated by the CPU.
unsigned long long int volatile stepAndOwner;
};
// Static configuration describing one MoE load-balancing setup: model layer shape,
// parallelism layout and slot layout. The members have no default member initializers.
struct MoeLoadBalanceMetaInfo
{
// Model Layer Info
// Number of experts; the per-expert arrays in this file are sized by it.
int expertCount;
// NOTE(review): presumably the number of experts selected per token -- confirm with callers.
int topK;
// Parallelism Info
// NOTE(review): presumably this rank's index within the expert-parallel group and
// the size of that group -- confirm with callers.
int epRank;
int epSize;
// Slot Info
// Number of slots on each rank; epSize * slotCountPerRank is the total global slot count.
int slotCountPerRank;
};
// Buffers and parameters for the per-expert load statistics.
// Both pointers default to nullptr.
struct MoeLoadBalanceStatisticInfo
{
// Statistic Info
// expertLoadFactor[i] is the load factor of expert i.
// The length of expertLoadFactor should be expertCount.
float* expertLoadFactor = nullptr;
// expertTokenCount[i] is the number of tokens of expert i.
// The length of expertTokenCount should be rawDataWindowSize * expertCount.
int* expertTokenCount = nullptr;
// rawDataWindowSize is the size of the raw data window,
// i.e. how many steps of raw data are kept in memory.
// Currently only the data of the current iteration is kept (window size 1);
// data from previous iterations should already be summed into expertLoadFactor.
static constexpr int rawDataWindowSize = 1;
// decayFactor is the decay factor applied to the raw data per step,
// e.g. if decayFactor is 0.95, the raw data of expert i is decayed by 0.95 at each step.
float decayFactor = 0.95f;
};
// The placement information for GPU: for every expert, which global slots hold its replicas.
// All pointers default to nullptr.
struct MoePlacementInfo
{
// Placement Info
// expertReplicaCount[i] is the number of replicas of expert i.
int* expertReplicaCount = nullptr;
// expertReplicaStartOffset[i] is the start offset of expert i's replicas in globalSlotIds.
// The entries globalSlotIds[expertReplicaStartOffset[i]] through
// globalSlotIds[expertReplicaStartOffset[i] + expertReplicaCount[i] - 1] are the candidate
// global slot ids for expert i; a token for expert i can be dispatched to any one of them.
int* expertReplicaStartOffset = nullptr;
// globalSlotIds holds global slot ids grouped by expert: it is indexed through
// expertReplicaStartOffset / expertReplicaCount as described above, not directly by expert id.
// The length of globalSlotIds should be epSize * slotCountPerRank.
int* globalSlotIds = nullptr;
};
} // namespace kernels
} // namespace tensorrt_llm