[TRTLLM-9465][fix] Swap TP-CP grouping order (#10350)

Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
Balaram Buddharaju authored 2026-01-05 04:08:03 -08:00; committed by GitHub
parent 3749a2ce1c
commit a792c23dcf
14 changed files with 269 additions and 102 deletions

View File

@@ -104,12 +104,14 @@ public:
     [[nodiscard]] SizeType32 constexpr getTensorParallelRank() const noexcept
     {
-        return mRank % mTensorParallelism;
+        // Layout: pp is outermost, then tp, then cp is innermost (consecutive).
+        return (mRank % (mTensorParallelism * mContextParallelism)) / mContextParallelism;
     }

     [[nodiscard]] SizeType32 constexpr getContextParallelRank() const noexcept
     {
-        return (mRank % (mTensorParallelism * mContextParallelism)) / mTensorParallelism;
+        // Layout: pp is outermost, then tp, then cp is innermost (consecutive).
+        return mRank % mContextParallelism;
     }

     [[nodiscard]] SizeType32 constexpr getLocalRank() const noexcept
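For reference, under the new cp-innermost layout a flat rank decomposes as rank = ppRank * (tp * cp) + tpRank * cp + cpRank. A minimal Python sketch (illustrative, not part of the diff; the same index arithmetic recurs in CacheFormatter and MLACacheFormatter below):

def decompose(rank, tp, pp, cp):
    # Layout: rank = pp_rank * (tp * cp) + tp_rank * cp + cp_rank.
    pp_rank = rank // (tp * cp)
    tp_rank = (rank % (tp * cp)) // cp
    cp_rank = rank % cp
    return pp_rank, tp_rank, cp_rank

# tp=2, pp=2, cp=2: rank 5 -> (pp=1, tp=0, cp=1); under the old tp-innermost
# layout the same rank 5 would have decomposed to (pp=1, tp=1, cp=0).
assert decompose(5, tp=2, pp=2, cp=2) == (1, 0, 1)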

View File

@@ -154,7 +154,8 @@ bool CacheFormatter::needSendCache(
         return true;
     }
-    int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism;
+    int selfCpSize = selfConfig.getParallelConfig().mContextParallelism;
+    int selfTpRank = (selfIdx % (selfConfig.getParallelConfig().mTensorParallelism * selfCpSize)) / selfCpSize;
     int selfTpRankInDpGroup = selfTpRank;
     if (selfConfig.getParallelConfig().mEnableAttentionDP)
     {

View File

@@ -60,7 +60,8 @@ std::vector<size_t> MLACacheFormatter::pickRecvConnections(
 bool MLACacheFormatter::needSendCache(
     CacheState const& selfConfig, CacheState const& destConfig, runtime::SizeType32 selfIdx)
 {
-    int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism;
+    int selfCpSize = selfConfig.getParallelConfig().mContextParallelism;
+    int selfTpRank = (selfIdx % (selfConfig.getParallelConfig().mTensorParallelism * selfCpSize)) / selfCpSize;
     int destTPNumInDPGroup = destConfig.getParallelConfig().mEnableAttentionDP
         ? destConfig.getParallelConfig().mTensorParallelism / destConfig.getParallelConfig().mDPsize

View File

@@ -107,9 +107,9 @@ TargetRanksInfo TargetRanksInfoForDP(
     auto const peerCPNum = peerParConfig.mContextParallelism;
     auto const selfCPNum = selfParConfig.mContextParallelism;
-    auto const selfTPRank = selfRank % selfTPNum;
+    auto const selfCPRank = selfRank % selfCPNum;
+    auto const selfTPRank = (selfRank % (selfTPNum * selfCPNum)) / selfCPNum;
     auto const selfPPRank = selfRank / (selfTPNum * selfCPNum);
-    auto const selfCPRank = (selfRank % (selfTPNum * selfCPNum)) / selfTPNum;

     int peerPPRankStart = 0;
     int mDomainPPSize = 1;
@@ -205,13 +205,14 @@ TargetRanksInfo TargetRanksInfoForDP(
     }

     std::vector<int> retRanks;
-    for (int i = peerTPRankStart; i < peerTPRankEnd; i++)
+    for (int i = peerCPRankStart; i < peerCPRankEnd; i++)
     {
-        for (int j = peerCPRankStart; j < peerCPRankEnd; j++)
+        for (int j = peerTPRankStart; j < peerTPRankEnd; j++)
         {
             for (int k = peerPPRankStart; k < peerPPRankEnd; k++)
             {
-                int irank = (k * peerTPNum * peerCPNum) + (j * peerTPNum) + i;
+                // Rank formula: ppRank * (tpNum * cpNum) + tpRank * cpNum + cpRank.
+                int irank = (k * peerTPNum * peerCPNum) + (j * peerCPNum) + i;
                 retRanks.push_back(irank);
             }
         }
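The loop swap means peers are now enumerated cp-outermost while the rank formula strides tp by cpNum. A small sketch (hypothetical helper names, mirroring the new loop order) that reproduces the expectRanks ordering seen in the updated tests:

def peer_ranks(pp_range, tp_range, cp_range, tp_num, cp_num):
    # Mirrors the swapped loops: cp outermost, then tp, then pp innermost.
    ranks = []
    for i in cp_range:
        for j in tp_range:
            for k in pp_range:
                # rank = ppRank * (tpNum * cpNum) + tpRank * cpNum + cpRank
                ranks.append(k * tp_num * cp_num + j * cp_num + i)
    return ranks

# Context tp=2, pp=2, cp=1 -> gen tp=4, pp=2, cp=2: context rank 0 targets
# gen tp ranks {0, 1} and cp ranks {0, 1} at pp rank 0, in cp-major order:
assert peer_ranks(range(1), range(2), range(2), tp_num=4, cp_num=2) == [0, 2, 1, 3]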

View File

@@ -142,6 +142,9 @@ WorldConfig WorldConfig::mpi(SizeType32 gpusPerNode, std::optional<SizeType32> t

 std::vector<SizeType32> WorldConfig::getPipelineParallelGroup() const
 {
+    // Layout: pp is outermost, then tp, then cp is innermost (consecutive).
+    // rank = ppRank * (tp * cp) + tpRank * cp + cpRank
+    // PP group: all ranks with same (tpRank, cpRank) but different ppRank.
     auto const pp = getPipelineParallelism();
     auto const tp = getTensorParallelism();
     auto const cp = getContextParallelism();
@@ -157,29 +160,35 @@ std::vector<SizeType32> WorldConfig::getPipelineParallelGroup() const

 std::vector<SizeType32> WorldConfig::getTensorParallelGroup() const
 {
+    // Layout: pp is outermost, then tp, then cp is innermost (consecutive).
+    // rank = ppRank * (tp * cp) + tpRank * cp + cpRank
+    // TP group: all ranks with same (ppRank, cpRank) but different tpRank.
     auto const tp = getTensorParallelism();
+    auto const cp = getContextParallelism();
     auto const rank = getRank();
     auto const tpRank = getTensorParallelRank();
     std::vector<SizeType32> group;
     group.reserve(tp);
     for (SizeType32 idx = 0; idx < tp; idx++)
     {
-        group.push_back(rank - tpRank + idx);
+        group.push_back(rank - tpRank * cp + idx * cp);
     }
     return group;
 }

 std::vector<SizeType32> WorldConfig::getContextParallelGroup() const
 {
+    // Layout: pp is outermost, then tp, then cp is innermost (consecutive).
+    // rank = ppRank * (tp * cp) + tpRank * cp + cpRank
+    // CP group: all ranks with same (ppRank, tpRank) but different cpRank.
     auto const cp = getContextParallelism();
-    auto const tp = getTensorParallelism();
-    auto const pp = getPipelineParallelism();
     auto const rank = getRank();
+    auto const cpRank = getContextParallelRank();
     std::vector<SizeType32> group;
     group.reserve(cp);
     for (SizeType32 idx = 0; idx < cp; idx++)
     {
-        group.push_back(rank + cp % (tp * pp));
+        group.push_back(rank - cpRank + idx);
     }
     return group;
 }
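Taken together, the three getters follow one rule: cp peers are consecutive, tp peers stride by cp, and pp peers stride by tp * cp. A compact sketch (illustrative names, assuming the layout above) reproducing the group math:

def groups(tp, pp, cp, rank):
    # With cp innermost: cp peers are consecutive, tp peers stride by cp,
    # pp peers stride by tp * cp.
    pp_rank, tp_rank, cp_rank = rank // (tp * cp), (rank % (tp * cp)) // cp, rank % cp
    cp_group = [rank - cp_rank + i for i in range(cp)]
    tp_group = [rank - tp_rank * cp + i * cp for i in range(tp)]
    pp_group = [rank % (tp * cp) + i * tp * cp for i in range(pp)]
    return cp_group, tp_group, pp_group

# tp=2, pp=2, cp=2, rank 3 (ppRank=0, tpRank=1, cpRank=1):
assert groups(2, 2, 2, 3) == ([2, 3], [1, 3], [3, 7])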

View File

@@ -2029,16 +2029,16 @@ TEST(targetTest, CacheStateNODP)
     tr::WorldConfig const contextWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 1};
     tr::WorldConfig const genWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 2};
     verifyContext(
-        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2},
+        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1},
         /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3},
+        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3},
         /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6},
+        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 5},
         /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {5, 7},
+        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {6, 7},
         /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
 }
@@ -2047,19 +2047,19 @@ TEST(targetTest, CacheStateNODP)
     tr::WorldConfig const contextWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 1};
     tr::WorldConfig const genWC{/*tpSize*/ 4, /*ppSize*/ 2, /*cpSize*/ 2};
     verifyContext(
-        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 4, 1, 5},
+        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2, 1, 3},
         /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 6, 3, 7},
+        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6, 5, 7},
         /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {8, 12, 9, 13},
+        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {8, 10, 9, 11},
         /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {10, 14, 11, 15},
+        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {12, 14, 13, 15},
         /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
 }
@@ -2069,16 +2069,16 @@ TEST(targetTest, CacheStateNODP)
     tr::WorldConfig const contextWC{/*tpSize*/ 4, /*ppSize*/ 1, /*cpSize*/ 1};
     tr::WorldConfig const genWC{/*tpSize*/ 2, /*ppSize*/ 1, /*cpSize*/ 2};
     verifyContext(
-        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false);
     verifyContext(
-        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false);
 }
@@ -2087,19 +2087,19 @@ TEST(targetTest, CacheStateNODP)
     tr::WorldConfig const contextWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 1};
     tr::WorldConfig const genWC{/*tpSize*/ 2, /*ppSize*/ 4, /*cpSize*/ 2};
     verifyContext(
-        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 4, 2, 6},
+        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 4, 1, 5},
         /*expectPPDomain*/ 2,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 5, 3, 7},
+        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 6, 3, 7},
         /*expectPPDomain*/ 2,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {8, 12, 10, 14},
+        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {8, 12, 9, 13},
         /*expectPPDomain*/ 2,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {9, 13, 11, 15},
+        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {10, 14, 11, 15},
         /*expectPPDomain*/ 2,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
 }
@@ -2109,28 +2109,28 @@ TEST(targetTest, CacheStateNODP)
     tr::WorldConfig const contextWC{/*tpSize*/ 2, /*ppSize*/ 4, /*cpSize*/ 1};
     tr::WorldConfig const genWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 2};
     verifyContext(
-        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 4, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 4, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 5}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 5, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {5, 7}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 5, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {6, 7}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 6, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 6, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 5}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 7, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {5, 7}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 7, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {6, 7}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
 }
@@ -2139,28 +2139,28 @@ TEST(targetTest, CacheStateNODP)
     tr::WorldConfig const contextWC{/*tpSize*/ 4, /*ppSize*/ 2, /*cpSize*/ 1};
     tr::WorldConfig const genWC{/*tpSize*/ 2, /*ppSize*/ 1, /*cpSize*/ 2};
     verifyContext(
-        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false);
     verifyContext(
-        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false);
     verifyContext(
-        /*contextRank*/ 4, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 4, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 5, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 5, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false);
     verifyContext(
-        /*contextRank*/ 6, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 6, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 7, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1,
+        /*contextRank*/ 7, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false);
 }
@@ -2169,19 +2169,19 @@ TEST(targetTest, CacheStateNODP)
     tr::WorldConfig const contextWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 1};
     tr::WorldConfig const genWC{/*tpSize*/ 4, /*ppSize*/ 1, /*cpSize*/ 2};
     verifyContext(
-        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 4, 1, 5},
+        /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2, 1, 3},
         /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 6, 3, 7},
+        /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6, 5, 7},
         /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 4, 1, 5},
+        /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2, 1, 3},
         /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
     verifyContext(
-        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 6, 3, 7},
+        /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6, 5, 7},
         /*expectPPDomain*/ 1,
         /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true);
 }

View File

@@ -56,3 +56,139 @@ TEST(WorldConfig, DeviceIds)
     EXPECT_NO_THROW(tr::WorldConfig(tensorParallelism, pipelineParallelism, contextParallelism, rank, gpusPerNode,
         std::vector{0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12}));
 }
+
+// Test for parallel rank calculations and group membership.
+// Layout: pp is outermost, then tp, then cp is innermost (consecutive).
+// rank = ppRank * (tp * cp) + tpRank * cp + cpRank
+TEST(WorldConfig, ParallelRanks)
+{
+    auto constexpr tp = 2;
+    auto constexpr pp = 2;
+    auto constexpr cp = 2;
+    auto constexpr gpusPerNode = 16;
+
+    // Test all 8 ranks in a tp=2, pp=2, cp=2 configuration.
+    // Rank 0: ppRank=0, tpRank=0, cpRank=0
+    {
+        tr::WorldConfig config(tp, pp, cp, 0, gpusPerNode);
+        EXPECT_EQ(config.getPipelineParallelRank(), 0);
+        EXPECT_EQ(config.getTensorParallelRank(), 0);
+        EXPECT_EQ(config.getContextParallelRank(), 0);
+    }
+    // Rank 1: ppRank=0, tpRank=0, cpRank=1
+    {
+        tr::WorldConfig config(tp, pp, cp, 1, gpusPerNode);
+        EXPECT_EQ(config.getPipelineParallelRank(), 0);
+        EXPECT_EQ(config.getTensorParallelRank(), 0);
+        EXPECT_EQ(config.getContextParallelRank(), 1);
+    }
+    // Rank 2: ppRank=0, tpRank=1, cpRank=0
+    {
+        tr::WorldConfig config(tp, pp, cp, 2, gpusPerNode);
+        EXPECT_EQ(config.getPipelineParallelRank(), 0);
+        EXPECT_EQ(config.getTensorParallelRank(), 1);
+        EXPECT_EQ(config.getContextParallelRank(), 0);
+    }
+    // Rank 3: ppRank=0, tpRank=1, cpRank=1
+    {
+        tr::WorldConfig config(tp, pp, cp, 3, gpusPerNode);
+        EXPECT_EQ(config.getPipelineParallelRank(), 0);
+        EXPECT_EQ(config.getTensorParallelRank(), 1);
+        EXPECT_EQ(config.getContextParallelRank(), 1);
+    }
+    // Rank 4: ppRank=1, tpRank=0, cpRank=0
+    {
+        tr::WorldConfig config(tp, pp, cp, 4, gpusPerNode);
+        EXPECT_EQ(config.getPipelineParallelRank(), 1);
+        EXPECT_EQ(config.getTensorParallelRank(), 0);
+        EXPECT_EQ(config.getContextParallelRank(), 0);
+    }
+    // Rank 5: ppRank=1, tpRank=0, cpRank=1
+    {
+        tr::WorldConfig config(tp, pp, cp, 5, gpusPerNode);
+        EXPECT_EQ(config.getPipelineParallelRank(), 1);
+        EXPECT_EQ(config.getTensorParallelRank(), 0);
+        EXPECT_EQ(config.getContextParallelRank(), 1);
+    }
+    // Rank 6: ppRank=1, tpRank=1, cpRank=0
+    {
+        tr::WorldConfig config(tp, pp, cp, 6, gpusPerNode);
+        EXPECT_EQ(config.getPipelineParallelRank(), 1);
+        EXPECT_EQ(config.getTensorParallelRank(), 1);
+        EXPECT_EQ(config.getContextParallelRank(), 0);
+    }
+    // Rank 7: ppRank=1, tpRank=1, cpRank=1
+    {
+        tr::WorldConfig config(tp, pp, cp, 7, gpusPerNode);
+        EXPECT_EQ(config.getPipelineParallelRank(), 1);
+        EXPECT_EQ(config.getTensorParallelRank(), 1);
+        EXPECT_EQ(config.getContextParallelRank(), 1);
+    }
+}
+
+TEST(WorldConfig, ParallelGroups)
+{
+    auto constexpr tp = 2;
+    auto constexpr pp = 2;
+    auto constexpr cp = 2;
+    auto constexpr gpusPerNode = 16;
+
+    // Test group membership for rank 3 (ppRank=0, tpRank=1, cpRank=1).
+    // CP group: all ranks with same (ppRank=0, tpRank=1) = [2, 3].
+    // TP group: all ranks with same (ppRank=0, cpRank=1) = [1, 3].
+    // PP group: all ranks with same (tpRank=1, cpRank=1) = [3, 7].
+    {
+        tr::WorldConfig config(tp, pp, cp, 3, gpusPerNode);
+        auto cpGroup = config.getContextParallelGroup();
+        auto tpGroup = config.getTensorParallelGroup();
+        auto ppGroup = config.getPipelineParallelGroup();
+        EXPECT_EQ(cpGroup, (std::vector<tr::SizeType32>{2, 3}));
+        EXPECT_EQ(tpGroup, (std::vector<tr::SizeType32>{1, 3}));
+        EXPECT_EQ(ppGroup, (std::vector<tr::SizeType32>{3, 7}));
+    }
+    // Test group membership for rank 5 (ppRank=1, tpRank=0, cpRank=1).
+    // CP group: all ranks with same (ppRank=1, tpRank=0) = [4, 5].
+    // TP group: all ranks with same (ppRank=1, cpRank=1) = [5, 7].
+    // PP group: all ranks with same (tpRank=0, cpRank=1) = [1, 5].
+    {
+        tr::WorldConfig config(tp, pp, cp, 5, gpusPerNode);
+        auto cpGroup = config.getContextParallelGroup();
+        auto tpGroup = config.getTensorParallelGroup();
+        auto ppGroup = config.getPipelineParallelGroup();
+        EXPECT_EQ(cpGroup, (std::vector<tr::SizeType32>{4, 5}));
+        EXPECT_EQ(tpGroup, (std::vector<tr::SizeType32>{5, 7}));
+        EXPECT_EQ(ppGroup, (std::vector<tr::SizeType32>{1, 5}));
+    }
+}
+
+TEST(WorldConfig, ParallelGroupsLargerConfig)
+{
+    // Test with tp=2, pp=2, cp=4, worldSize=16.
+    auto constexpr tp = 2;
+    auto constexpr pp = 2;
+    auto constexpr cp = 4;
+    auto constexpr gpusPerNode = 16;
+
+    // Rank 9: ppRank = 9 / (2*4) = 1, tpRank = (9 % 8) / 4 = 0, cpRank = 9 % 4 = 1.
+    // CP group: ranks with same (ppRank=1, tpRank=0) = [8, 9, 10, 11].
+    // TP group: ranks with same (ppRank=1, cpRank=1) = [9, 13].
+    // PP group: ranks with same (tpRank=0, cpRank=1) = [1, 9].
+    {
+        tr::WorldConfig config(tp, pp, cp, 9, gpusPerNode);
+        EXPECT_EQ(config.getPipelineParallelRank(), 1);
+        EXPECT_EQ(config.getTensorParallelRank(), 0);
+        EXPECT_EQ(config.getContextParallelRank(), 1);
+        auto cpGroup = config.getContextParallelGroup();
+        auto tpGroup = config.getTensorParallelGroup();
+        auto ppGroup = config.getPipelineParallelGroup();
+        EXPECT_EQ(cpGroup, (std::vector<tr::SizeType32>{8, 9, 10, 11}));
+        EXPECT_EQ(tpGroup, (std::vector<tr::SizeType32>{9, 13}));
+        EXPECT_EQ(ppGroup, (std::vector<tr::SizeType32>{1, 9}));
+    }
+}

View File

@@ -118,8 +118,10 @@ class DeviceMeshTopologyImpl(_MappingBaseForTypeCheck):
                 "DeviceMesh creation requested but torch.distributed process group "
                 "has not been initialised.")

-        dims = ["cp", "pp"]
-        shape = [self.cp_size, self.pp_size]
+        # Dimensions go from slowest-varying (outermost) to fastest-varying (innermost).
+        # Layout: pp is outermost, then tp, then cp is innermost (consecutive).
+        dims = ["pp"]
+        shape = [self.pp_size]

         if self.moe_ep_size > 1:
             dims += ["moe_tp", "moe_ep"]
@@ -128,6 +130,9 @@ class DeviceMeshTopologyImpl(_MappingBaseForTypeCheck):
             dims += ["tp"]
             shape += [self.tp_size]

+        dims += ["cp"]
+        shape += [self.cp_size]
+
         cls.device_mesh = init_device_mesh(
             "cuda",
             mesh_shape=tuple(shape),
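init_device_mesh lays ranks out row-major over mesh_shape, so listing cp last makes it the fastest-varying dimension. A torch-free sketch (illustrative only, small sizes assumed) of that equivalence:

import itertools

pp_size, tp_size, cp_size = 2, 2, 2
# Row-major layout over (pp, tp, cp): the last dim varies fastest,
# so ranks within a cp group are consecutive.
mesh = {}
for rank, (ppr, tpr, cpr) in enumerate(
        itertools.product(range(pp_size), range(tp_size), range(cp_size))):
    mesh[(ppr, tpr, cpr)] = rank
assert mesh[(0, 1, 0)] == 2 and mesh[(0, 1, 1)] == 3  # cp peers 2 and 3 are adjacent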

View File

@@ -292,18 +292,16 @@ class MappingBase:
         return self.cp_size > 1

     def prev_cp_rank(self):
-        p = self.rank - self.tp_size
-        if p // (self.tp_size * self.cp_size) < self.rank // (self.tp_size *
-                                                              self.cp_size):
-            return p + self.tp_size * self.cp_size
-        return p
+        # cp ranks are consecutive, so prev is rank - 1 with wraparound within cp group.
+        if self.cp_rank == 0:
+            return self.rank + self.cp_size - 1
+        return self.rank - 1

     def next_cp_rank(self):
-        p = self.rank + self.tp_size
-        if p // (self.tp_size * self.cp_size) > self.rank // (self.tp_size *
-                                                              self.cp_size):
-            return p - self.tp_size * self.cp_size
-        return p
+        # cp ranks are consecutive, so next is rank + 1 with wraparound within cp group.
+        if self.cp_rank == self.cp_size - 1:
+            return self.rank - self.cp_size + 1
+        return self.rank + 1

     def has_moe_cluster(self):
         return self.moe_cluster_size > 1
@@ -378,17 +376,17 @@ class Mapping(MappingBase):
         A node with 8 GPUs, tp_size = 4, cp_size = 2, pp_size = 1

-        2 tp groups:
-        - [0, 1, 2, 3]
-        - [4, 5, 6, 7]
-
         4 cp groups:
-        - [0, 4]
-        - [1, 5]
-        - [2, 6]
-        - [3, 7]
+        - [0, 1]
+        - [2, 3]
+        - [4, 5]
+        - [6, 7]
+
+        2 tp groups:
+        - [0, 2, 4, 6]
+        - [1, 3, 5, 7]

         A node with 8 GPUs, moe_tp_size = 2, moe_ep_size = 4
@@ -437,23 +435,23 @@ class Mapping(MappingBase):
         2 nodes with 8 GPUs, tp_size 2, pp_size 2, cp_size 2

-        4 tp groups:
+        4 cp groups:
         - [0, 1]
         - [2, 3]
         - [4, 5]
         - [6, 7]

+        4 tp groups:
+        - [0, 2]
+        - [1, 3]
+        - [4, 6]
+        - [5, 7]
+
         4 pp groups:
         - [0, 4]
         - [1, 5]
         - [2, 6]
         - [3, 7]
-
-        4 cp groups:
-        - [0, 2]
-        - [1, 3]
-        - [4, 6]
-        - [5, 7]
         """

     def __new__(cls, *args, **kwargs):
@@ -551,7 +549,7 @@ class MpiTopology(Mapping):
     @property
     def tp_rank(self) -> int:
-        return self.rank % self.tp_size
+        return self.rank % (self.tp_size * self.cp_size) // self.cp_size

     @property
     def pp_rank(self) -> int:
@@ -559,7 +557,7 @@ class MpiTopology(Mapping):
     @property
     def cp_rank(self) -> int:
-        return self.rank % (self.tp_size * self.cp_size) // self.tp_size
+        return self.rank % self.cp_size

     @property
     def tp_group(self) -> List[int]:
@@ -567,7 +565,7 @@ class MpiTopology(Mapping):
     @property
     def pp_group(self) -> List[int]:
-        return self.pp_groups[self.cp_rank * self.tp_size + self.tp_rank]
+        return self.pp_groups[self.tp_rank * self.cp_size + self.cp_rank]

     @property
     def cp_group(self) -> List[int]:
@@ -598,20 +596,20 @@ class MpiTopology(Mapping):
             ranks = range(i, self.world_size, self.tp_size * self.cp_size)
             self.pp_groups.append(list(ranks))

-        # init cp group
+        # init cp group (consecutive ranks within each tp slice).
         for i in range(self.pp_size):
             for j in range(self.tp_size):
-                ranks = range(i * self.tp_size * self.cp_size + j,
-                              (i + 1) * self.tp_size * self.cp_size + j,
-                              self.tp_size)
+                ranks = range(
+                    i * self.tp_size * self.cp_size + j * self.cp_size,
+                    i * self.tp_size * self.cp_size + (j + 1) * self.cp_size)
                 self.cp_groups.append(list(ranks))

-        # init tp group
+        # init tp group (interleaved ranks with stride of cp_size).
         for i in range(self.pp_size):
             for j in range(self.cp_size):
-                ranks = range(
-                    i * self.tp_size * self.cp_size + j * self.tp_size,
-                    i * self.tp_size * self.cp_size + (j + 1) * self.tp_size)
+                ranks = range(i * self.tp_size * self.cp_size + j,
+                              (i + 1) * self.tp_size * self.cp_size + j,
+                              self.cp_size)
                 self.tp_groups.append(list(ranks))

         # init moe tp group
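Since cp peers are now consecutive, the ring neighbours reduce to rank ± 1 with wraparound inside the group. A quick standalone restatement of the new prev/next logic (illustrative helpers, not the Mapping API itself):

def prev_cp_rank(rank, cp_size):
    cp_rank = rank % cp_size
    return rank + cp_size - 1 if cp_rank == 0 else rank - 1

def next_cp_rank(rank, cp_size):
    cp_rank = rank % cp_size
    return rank - cp_size + 1 if cp_rank == cp_size - 1 else rank + 1

# world_size=16, tp=2, pp=2, cp=4: rank 9 sits in cp group [8, 9, 10, 11].
assert prev_cp_rank(9, 4) == 8 and next_cp_rank(9, 4) == 10
assert prev_cp_rank(8, 4) == 11 and next_cp_rank(11, 4) == 8  # wraparound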

View File

@@ -740,10 +740,11 @@ class PretrainedModel(Module,
         rank = config.mapping.rank
         if config.mapping.cp_size > 1:
-            # tp_cp_pp rank -> tp_pp rank: because different cp ranks share the same ckpt
-            tp_size = config.mapping.tp_size
+            # cp_tp_pp rank -> tp_pp rank: because different cp ranks share the same ckpt.
             cp_size = config.mapping.cp_size
-            rank = rank % tp_size + rank // (tp_size * cp_size) * tp_size
+            # rank = pp_rank × tp_size × cp_size + tp_rank × cp_size + cp_rank.
+            # rank // cp_size is equivalent to pp_rank × tp_size + tp_rank.
+            rank = rank // cp_size
         weights_path = os.path.join(ckpt_dir, f'rank{rank}.safetensors')
         assert os.path.isfile(weights_path)
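With cp innermost, integer division by cp_size collapses all cp peers of a given (pp, tp) coordinate onto the same checkpoint shard. A worked check (illustrative, assuming the layout above):

tp_size, pp_size, cp_size = 2, 2, 2
for rank in range(tp_size * pp_size * cp_size):
    # rank = pp_rank * tp_size * cp_size + tp_rank * cp_size + cp_rank,
    # so rank // cp_size == pp_rank * tp_size + tp_rank (the tp_pp ckpt rank).
    ckpt_rank = rank // cp_size
    assert ckpt_rank == (rank // (tp_size * cp_size)) * tp_size \
        + (rank % (tp_size * cp_size)) // cp_size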

View File

@@ -872,8 +872,9 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
         task.evaluate(llm)

     @pytest.mark.skip_less_device(8)
-    @pytest.mark.parametrize("gen_pp,gen_tp,gen_cp", [(1, 2, 2), (2, 1, 2)],
-                             ids=["pp1tp2cp2", "pp2tp1cp2"])
+    @pytest.mark.parametrize("gen_pp,gen_tp,gen_cp", [(1, 1, 4), (1, 2, 2),
+                                                      (2, 1, 2)],
+                             ids=["pp1tp1cp4", "pp1tp2cp2", "pp2tp1cp2"])
     @pytest.mark.parametrize("cuda_graph_config", [
         None,
         {

View File

@@ -540,6 +540,14 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=0]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp2cp2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp2cp2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp1tp1cp4]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:none-pp1tp1cp4]
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4]
 accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:none-pp2tp1cp2]

View File

@@ -72,6 +72,8 @@ l0_dgx_b200:
       orchestrator: mpi
       tests:
       - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp2tp1cp2] TIMEOUT (60)
+      - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp2cp2] TIMEOUT (60)
+      - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp1tp1cp4] TIMEOUT (60)
       - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (60)
       - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (60)
       - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (60)
@@ -100,6 +102,8 @@ l0_dgx_b200:
      orchestrator: mpi
      tests:
      - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[fifo-cudagraph:with_padding-pp2tp1cp2] TIMEOUT (60)
+      - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp2cp2] TIMEOUT (60)
+      - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix[nccl-cudagraph:with_padding-pp1tp1cp4] TIMEOUT (60)
       - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (60)
       - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (60)
       - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (60)

View File

@@ -57,27 +57,27 @@ class TestMapping(unittest.TestCase):
         self.assertEqual(len(m.tp_groups), 4)
         self.assertEqual(len(m.pp_groups), 4)
         self.assertEqual(len(m.cp_groups), 4)
-        self.assertEqual(m.tp_group, [2, 3])
+        self.assertEqual(m.tp_group, [1, 3])
         self.assertEqual(m.pp_group, [3, 7])
-        self.assertEqual(m.cp_group, [1, 3])
+        self.assertEqual(m.cp_group, [2, 3])
         self.assertTrue(m.is_first_pp_rank())
         self.assertFalse(m.is_last_pp_rank())
         self.assertFalse(m.is_first_cp_rank())
         self.assertTrue(m.is_last_cp_rank())
         self.assertEqual(m.prev_pp_rank(), 7)
         self.assertEqual(m.next_pp_rank(), 7)
-        self.assertEqual(m.prev_cp_rank(), 1)
-        self.assertEqual(m.next_cp_rank(), 1)
+        self.assertEqual(m.prev_cp_rank(), 2)
+        self.assertEqual(m.next_cp_rank(), 2)

         m = Mapping(world_size=16, rank=9, tp_size=2, pp_size=2, cp_size=4)
-        self.assertEqual(m.tp_group, [8, 9])
+        self.assertEqual(m.tp_group, [9, 13])
         self.assertEqual(m.pp_group, [1, 9])
-        self.assertEqual(m.cp_group, [9, 11, 13, 15])
+        self.assertEqual(m.cp_group, [8, 9, 10, 11])
         self.assertFalse(m.is_first_pp_rank())
         self.assertTrue(m.is_last_pp_rank())
-        self.assertTrue(m.is_first_cp_rank())
+        self.assertFalse(m.is_first_cp_rank())
         self.assertFalse(m.is_last_cp_rank())
         self.assertEqual(m.prev_pp_rank(), 1)
         self.assertEqual(m.next_pp_rank(), 1)
-        self.assertEqual(m.prev_cp_rank(), 15)
-        self.assertEqual(m.next_cp_rank(), 11)
+        self.assertEqual(m.prev_cp_rank(), 8)
+        self.assertEqual(m.next_cp_rank(), 10)