TensorRT-LLMs/cpp/tensorrt_llm/layers/lookaheadDecodingUtils.h

/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include "tensorrt_llm/common/assert.h" // TLLM_CHECK
#include "tensorrt_llm/common/logger.h" // TLLM_LOG_DEBUG
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/tensorView.h"

#include <algorithm>
#include <cstdint>
#include <initializer_list>
#include <memory>
#include <sstream>
#include <string>
#include <type_traits>

namespace tensorrt_llm::layers
{
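// BufferLocation<T> extends runtime::BufferRange<T> with stride-aware,
// multi-dimensional element access over an ITensor: the strides are captured
// from the tensor shape at construction, and at()/ptr()/offset() translate a
// per-dimension index into a flat position in the underlying buffer.
//
// Usage sketch (hypothetical host tensor and indices):
//     BufferLocation<runtime::SizeType32> lengths(*lengthsTensor); // shape [batch, beam]
//     lengths.at(bi, ri) = 0;        // variadic index
//     auto v = lengths[{bi, ri}];    // initializer-list index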
template <typename T>
class BufferLocation : public runtime::BufferRange<T>
{
public:
using typename runtime::BufferRange<T>::size_type;
using runtime::BufferRange<T>::begin;
using runtime::BufferRange<T>::operator[];
BufferLocation(T* data, size_type size)
: runtime::BufferRange<T>{data, size}
{
}
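
    // The two tensor constructors are SFINAE-selected: a mutable ITensor binds to
    // BufferLocation<T>, a const ITensor to BufferLocation<T const>.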
template <typename U = T, std::enable_if_t<!std::is_const_v<U>, bool> = true>
explicit BufferLocation(runtime::ITensor& tensor)
: BufferLocation(runtime::bufferCast<U>(tensor), tensor.getSize())
{
mStrides = runtime::ITensor::strides(tensor.getShape());
}
template <typename U = T, std::enable_if_t<std::is_const_v<U>, bool> = true>
explicit BufferLocation(runtime::ITensor const& tensor)
: BufferLocation(runtime::bufferCast<U>(tensor), tensor.getSize())
{
mStrides = runtime::ITensor::strides(tensor.getShape());
}
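
    // Element access: returns a reference to the element at the given
    // multi-dimensional index.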
inline T& at(runtime::ITensor::Shape const& dims)
{
return *ptr(dims);
}
inline T& at(std::initializer_list<runtime::ITensor::DimType64> const& dims)
{
return *ptr(dims);
}
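
    // Variadic overload: at(i0, i1, ...) accumulates index * stride per argument
    // via atHelper; the number of arguments should match the tensor rank.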
template <typename... Args>
inline T& at(Args... args)
{
runtime::ITensor::DimType64 offset = 0;
runtime::ITensor::DimType64 dims = 0;
atHelper(offset, dims, args...);
return *(begin() + offset);
}
inline T& operator[](runtime::ITensor::Shape const& dims)
{
return *ptr(dims);
}
inline T& operator[](std::initializer_list<runtime::ITensor::DimType64> const& dims)
{
return *ptr(dims);
}
inline T* ptr(runtime::ITensor::Shape const& dims)
{
return begin() + offset(dims);
}
inline T* ptr(std::initializer_list<runtime::ITensor::DimType64> const& dims)
{
return ptr(runtime::ITensor::makeShape(dims));
}
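
    // Flat offset of a multi-dimensional index: sum of index * stride over all
    // dimensions; the index rank must match the stride rank.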
runtime::ITensor::DimType64 offset(runtime::ITensor::Shape const& dims)
{
TLLM_CHECK(mStrides.nbDims == dims.nbDims);
runtime::ITensor::DimType64 result = 0;
for (runtime::ITensor::DimType64 di = 0; di < mStrides.nbDims; di++)
{
result += dims.d[di] * mStrides.d[di];
}
return result;
}
runtime::ITensor::DimType64 offset(std::initializer_list<runtime::ITensor::DimType64> const& dims)
{
return offset(runtime::ITensor::makeShape(dims));
}
private:
    // Recursion terminator for the variadic at() helper.
    inline void atHelper(runtime::ITensor::DimType64& /*offset*/, runtime::ITensor::DimType64& /*dims*/) {}
template <typename... Args>
inline void atHelper(runtime::ITensor::DimType64& offset, runtime::ITensor::DimType64& dims, int dim, Args... args)
{
offset += dim * mStrides.d[dims++];
atHelper(offset, dims, args...);
}
private:
runtime::ITensor::Shape mStrides;
};
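
// DebugTensor wraps an ITensor (plus a display name) for ad-hoc inspection while
// debugging: single-element reads and string/tokens/values/shape formatting.
// GPU tensors are copied to the host before formatting. Normally used through the
// D()/PRINT_* macros defined at the end of this file.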
class DebugTensor
{
public:
DebugTensor(runtime::ITensor const& tensor, char const* name)
: mTensor(tensor)
, mName(name)
{
}
DebugTensor(runtime::ITensor::SharedConstPtr tensor, char const* name)
: DebugTensor(*tensor, name)
{
}
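
    // Typed single-element reads by multi-dimensional or flat index. These
    // dereference the raw buffer pointer directly, so the tensor must be
    // host-accessible (unlike tokens()/values(), which copy from the GPU first).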
uint8_t const& u8(std::initializer_list<runtime::ITensor::DimType64> const& dims)
{
return (BufferLocation<uint8_t const>(mTensor))[dims];
}
uint8_t const& u8(int32_t idx)
{
return (BufferLocation<uint8_t const>(mTensor))[idx];
}
int8_t const& i8(std::initializer_list<runtime::ITensor::DimType64> const& dims)
{
return (BufferLocation<int8_t const>(mTensor))[dims];
}
int8_t const& i8(int32_t idx)
{
return (BufferLocation<int8_t const>(mTensor))[idx];
}
int32_t const& i32(std::initializer_list<runtime::ITensor::DimType64> const& dims)
{
return (BufferLocation<int32_t const>(mTensor))[dims];
}
int32_t const& i32(int32_t idx)
{
return (BufferLocation<int32_t const>(mTensor))[idx];
}
int64_t const& i64(std::initializer_list<runtime::ITensor::DimType64> const& dims)
{
return (BufferLocation<int64_t const>(mTensor))[dims];
}
int64_t const& i64(int32_t idx)
{
return (BufferLocation<int64_t const>(mTensor))[idx];
}
float const& f(std::initializer_list<runtime::ITensor::DimType64> const& dims)
{
return (BufferLocation<float const>(mTensor))[dims];
}
float const& f(int32_t idx)
{
return (BufferLocation<float const>(mTensor))[idx];
}
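
    // Returns a host copy of the tensor when it lives on the GPU (synchronizing a
    // temporary stream before returning); returns nullptr when the tensor is
    // already host-accessible, in which case callers read the original buffer.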
runtime::BufferManager::ITensorPtr copyToHostOptional()
{
runtime::BufferManager::ITensorPtr hostPtr{nullptr};
if (mTensor.getMemoryType() == runtime::MemoryType::kGPU)
{
runtime::BufferManager manager{std::make_shared<runtime::CudaStream>()};
hostPtr = manager.copyFrom(mTensor, runtime::MemoryType::kCPU);
manager.getStream().synchronize();
}
return hostPtr;
}
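
    // Interprets the token ids as raw character codes and returns them as a
    // std::string, one character per element.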
std::string string(void)
{
runtime::BufferManager::ITensorPtr hostPtr = copyToHostOptional();
runtime::BufferRange<runtime::TokenIdType const> range(hostPtr ? (*hostPtr) : mTensor);
std::string result(range.size(), '\0');
std::copy(range.begin(), range.end(), result.begin());
return result;
}
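
    // Formats a rank-0, -1, or -2 tensor as token ids; ids in the printable ASCII
    // range are shown as quoted characters. Higher ranks are not expanded.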
std::string tokens(void)
{
using namespace tensorrt_llm::runtime;
std::ostringstream buf;
auto shape = mTensor.getShape();
runtime::BufferManager::ITensorPtr hostPtr = copyToHostOptional();
runtime::BufferRange<runtime::TokenIdType const> tensorRange(hostPtr ? (*hostPtr) : mTensor);
buf << mName << ": " << mTensor.getMemoryTypeName() << ',' << mTensor.getDataTypeName() << ',' << shape;
auto line = [&buf](TokenIdType const* array, SizeType32 size)
{
buf << '[';
for (SizeType32 i = 0; i < size; i++)
{
auto token = array[i];
if (token >= ' ' && token <= '~')
{
buf << '\'' << static_cast<char>(token) << '\'';
}
else
{
buf << token;
}
if (i != size - 1)
{
buf << ',';
}
}
buf << ']';
};
if (shape.nbDims == 0)
{
buf << "[]";
}
else if (shape.nbDims == 1)
{
line(tensorRange.begin(), shape.d[0]);
}
else if (shape.nbDims == 2)
{
buf << '[';
for (runtime::SizeType32 i = 0; i < shape.d[0]; i++)
{
buf << "\n " << i << ": ";
line(tensorRange.begin() + i * shape.d[1], shape.d[1]);
}
buf << ']';
}
else
{
buf << "Too Large to be printed";
}
return buf.str();
}
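
    // Formats a rank-0, -1, or -2 tensor as numeric values of element type T;
    // higher ranks are not expanded.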
template <typename T>
std::string values(void)
{
using namespace tensorrt_llm::runtime;
std::ostringstream buf;
auto shape = mTensor.getShape();
runtime::BufferManager::ITensorPtr hostPtr = copyToHostOptional();
runtime::BufferRange<T const> tensorRange(hostPtr ? (*hostPtr) : mTensor);
buf << mName << ": " << mTensor.getMemoryTypeName() << ',' << mTensor.getDataTypeName() << ',' << shape;
auto line = [&buf](T const* array, SizeType32 size)
{
buf << '[';
for (SizeType32 i = 0; i < size; i++)
{
                // Unary plus promotes int8_t/uint8_t/bool to int so they print as
                // numbers, while float and wider integer types keep their natural formatting.
                buf << +array[i];
if (i != size - 1)
{
buf << ',';
}
}
buf << ']';
};
if (shape.nbDims == 0)
{
buf << "[]";
}
else if (shape.nbDims == 1)
{
line(tensorRange.begin(), shape.d[0]);
}
else if (shape.nbDims == 2)
{
buf << '[';
for (runtime::SizeType32 i = 0; i < shape.d[0]; i++)
{
buf << "\n " << i << ": ";
line(tensorRange.begin() + i * shape.d[1], shape.d[1]);
}
buf << ']';
}
else
{
buf << "Too Large to be printed";
}
return buf.str();
}
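
    // Dispatches on the tensor's runtime data type to the typed formatter above.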
std::string values(void)
{
switch (mTensor.getDataType())
{
case nvinfer1::DataType::kBOOL: return values<bool>();
case nvinfer1::DataType::kFLOAT: return values<float>();
case nvinfer1::DataType::kINT8: return values<std::int8_t>();
case nvinfer1::DataType::kINT32: return values<std::int32_t>();
case nvinfer1::DataType::kINT64: return values<std::int64_t>();
case nvinfer1::DataType::kUINT8: return values<std::uint8_t>();
default: return std::string(mName + ": Unsupported data type");
}
}
std::string shape(void)
{
using namespace tensorrt_llm::runtime;
std::ostringstream buf;
buf << mName << ": " << mTensor.getShape();
return buf.str();
}
void print_tokens(void)
{
TLLM_LOG_DEBUG(tokens());
}
void print_values(void)
{
TLLM_LOG_DEBUG(values());
}
void print_shape(void)
{
TLLM_LOG_DEBUG(shape());
}
private:
runtime::ITensor const& mTensor;
std::string mName;
};
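
// D(x) wraps a tensor (or tensor shared pointer) in a DebugTensor named after the
// expression itself; the PRINT_* macros format it and send the result to the debug log.
//
// Usage sketch (hypothetical tensor variable; output appears only at debug log level):
//     runtime::ITensor::SharedConstPtr draftTokens = ...;
//     PRINT_SHAPE(draftTokens);             // e.g. "draftTokens: (1, 5)"
//     PRINT_TOKENS(draftTokens);            // printable ids shown as quoted chars
//     auto v = D(draftTokens).i32({0, 2});  // single element read (host tensors only)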
#define D(x) tensorrt_llm::layers::DebugTensor(x, #x)
#define PRINT_TOKENS(x) D(x).print_tokens()
#define PRINT_VALUES(x) D(x).print_values()
#define PRINT_SHAPE(x) D(x).print_shape()
} // namespace tensorrt_llm::layers