TensorRT-LLMs/cpp/tensorrt_llm/executor/jsonSerialization.cpp
wili 56cdfe5c6c
[TRTLLM-5000][feat] NGrams V2 (#4569)
Signed-off-by: wili-65535 <wili-65535@users.noreply.github.com>
Co-authored-by: wili-65535 <wili-65535@users.noreply.github.com>
2025-06-27 23:00:17 +08:00

70 lines
3.3 KiB
C++

/*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/common/jsonSerializeOptional.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/types.h"
#include <nlohmann/json.hpp>
using json = nlohmann::json;
namespace tensorrt_llm::executor
{
// JSON conversion definitions for the executor statistics types.
//
// Each NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Type, members...) invocation makes nlohmann::json
// generate to_json/from_json overloads for Type, using every listed member name verbatim as
// the JSON key. The member lists below therefore define the emitted JSON field names.

// Leaf statistics structs. They are declared ahead of IterationStats, whose member list names
// kvCacheStats, staticBatchingStats, inflightBatchingStats and specDecodingStats.
// NOTE(review): presumably those members are (possibly optional) instances of the types
// converted here — confirm against the struct declarations in the executor headers.
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(KvCacheStats, maxNumBlocks, freeNumBlocks, usedNumBlocks, tokensPerBlock,
allocTotalBlocks, allocNewBlocks, reusedBlocks, missedBlocks, cacheHitRate);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(
StaticBatchingStats, numScheduledRequests, numContextRequests, numCtxTokens, numGenTokens, emptyGenSlots);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(InflightBatchingStats, numScheduledRequests, numContextRequests, numGenRequests,
numPausedRequests, numCtxTokens, microBatchId, avgNumDecodedTokensPerIter);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(SpecDecodingStats, numDraftTokens, numAcceptedTokens, numRequestsWithDraftTokens,
acceptanceLength, iterLatencyMS, draftOverhead);
// Top-level per-iteration statistics object.
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(IterationStats, timestamp, iter, iterLatencyMS, newActiveRequestsQueueLatencyMS,
numNewActiveRequests, numActiveRequests, numQueuedRequests, numCompletedRequests, maxNumActiveRequests,
maxBatchSizeStatic, maxBatchSizeTunerRecommended, maxBatchSizeRuntime, maxNumTokensStatic,
maxNumTokensTunerRecommended, maxNumTokensRuntime, gpuMemUsage, cpuMemUsage, pinnedMemUsage, kvCacheStats,
staticBatchingStats, inflightBatchingStats, specDecodingStats);
// RequestStage is serialized as the string paired with each enumerator in the table below.
NLOHMANN_JSON_SERIALIZE_ENUM(RequestStage,
{{RequestStage::kQUEUED, "QUEUED"}, {RequestStage::kCONTEXT_IN_PROGRESS, "CONTEXT_IN_PROGRESS"},
{RequestStage::kGENERATION_IN_PROGRESS, "GENERATION_IN_PROGRESS"},
{RequestStage::kGENERATION_COMPLETE, "GENERATION_COMPLETE"}});
// Per-request statistics. RequestStats lists `stage` and `disServingStats`, so the RequestStage
// and DisServingRequestStats conversions are defined before it.
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(DisServingRequestStats, kvCacheTransferMS, kvCacheSize);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(RequestStats, id, stage, contextPrefillPosition, numGeneratedTokens,
avgNumDecodedTokensPerIter, scheduled, paused, disServingStats, allocTotalBlocksPerRequest,
allocNewBlocksPerRequest, reusedBlocksPerRequest, missedBlocksPerRequest, kvCacheHitRatePerRequest);
// Per-iteration wrapper pairing `iter` with `requestStats`.
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(RequestStatsPerIteration, iter, requestStats);
/// @brief Convert an IterationStats object to its JSON text form.
/// @param iterationStats Statistics to serialize.
/// @return The string returned by json::dump() called with its default arguments.
std::string JsonSerialization::toJsonStr(IterationStats const& iterationStats)
{
    // Parentheses are deliberate: brace-initialising nlohmann::json would build a
    // one-element JSON array instead of converting the value itself.
    return json(iterationStats).dump();
}
/// @brief Convert a RequestStatsPerIteration object to its JSON text form.
/// @param requestStatsPerIter Per-iteration request statistics to serialize.
/// @return The string returned by json::dump() called with its default arguments.
std::string JsonSerialization::toJsonStr(RequestStatsPerIteration const& requestStatsPerIter)
{
    // Parentheses are deliberate: brace-initialising nlohmann::json would build a
    // one-element JSON array instead of converting the value itself.
    return json(requestStatsPerIter).dump();
}
/// @brief Convert a single RequestStats object to its JSON text form.
/// @param requestStats Request statistics to serialize.
/// @return The string returned by json::dump() called with its default arguments.
std::string JsonSerialization::toJsonStr(RequestStats const& requestStats)
{
    // Parentheses are deliberate: brace-initialising nlohmann::json would build a
    // one-element JSON array instead of converting the value itself.
    return json(requestStats).dump();
}
} // namespace tensorrt_llm::executor