mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
39910f2b25
Signed-off-by: Bugen Zhao <i@bugenzhao.com> Signed-off-by: Nick Hill <nickhill123@gmail.com> Signed-off-by: Eric Curtin <eric.curtin@docker.com> Signed-off-by: Dev-X25874 <283057883+Dev-X25874@users.noreply.github.com> Signed-off-by: Will.hou <1205157517@qq.com> Signed-off-by: Will.hou <willamhou@ceresman.com> Co-authored-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: Eric Curtin <eric.curtin@docker.com> Co-authored-by: Dev-X25874 <283057883+Dev-X25874@users.noreply.github.com> Co-authored-by: Will.hou <1205157517@qq.com> Co-authored-by: Will.hou <willamhou@ceresman.com> Please see https://github.com/Inferact/vllm-frontend-rs for full original commit history.
197 lines
5.3 KiB
Protocol Buffer
197 lines
5.3 KiB
Protocol Buffer
// Service and message schema for vLLM text generation.
syntax = "proto3";

package vllm;

// Provides google.protobuf.Struct, used by the kv_transfer_params fields.
import "google/protobuf/struct.proto";
|
|
|
|
|
|
// Text generation service. Both methods take the same GenerateRequest; they
// differ only in whether the response is a single message or a stream.
service Generate {
  // Generates text given a prompt
  rpc Generate (GenerateRequest) returns (GenerateResponse) {}
  // Generates text given a prompt, streaming the outputs
  rpc GenerateStream (GenerateRequest) returns (stream GenerateResponse) {}
}
|
|
|
|
// ======================================================================================
// Generate Request
// ======================================================================================
|
|
|
|
// Request message accepted by both Generate and GenerateStream.
message GenerateRequest {
  // Identifier for this request.
  string request_id = 1;
  // Model the request is addressed to.
  string model = 2;

  // The prompt, supplied either as text or as token ids (at most one is set).
  oneof prompt {
    string text = 3;
    TokenIds token_ids = 4;
  }

  // Temperature, defaults to model-specific default or 0
  // (declared `optional` so that an explicit 0 is distinguishable from unset).
  optional float temperature = 5;
  // Parameters controlling random sampling, not applicable if temperature == 0
  RandomSampling sampling = 6;
  // Parameters for conditionally penalizing/boosting
  // candidate tokens during decoding
  DecodingParameters decoding = 7;
  // Parameters controlling when generation should stop
  StoppingCriteria stopping = 8;
  // Flags to control what is returned in the response
  ResponseOptions response = 9;
  // Parameters controlling KV cache/distribution
  KVCacheParameters kv = 10;

  // Truncate prompt tokens; default (0) means no truncation
  uint32 truncate_prompt_tokens = 11;

  // NOTE(review): presumably a scheduling priority; this file does not state
  // whether larger or smaller values are served first -- confirm.
  int32 priority = 12;
}
|
|
|
|
// Random-sampling parameters. Per GenerateRequest.sampling, these are not
// applicable when temperature == 0.
message RandomSampling {
  uint32 num_sequences = 1; // "n", default (0) means 1
  uint32 top_k = 2; // 0 means default
  float top_p = 3; // 0 means default
  float min_p = 4; // 0 means default
  // Sampling seed. Declared `optional` so that a seed of 0 can be
  // distinguished from "no seed supplied".
  optional int64 seed = 5;
}
|
|
|
|
// Parameters that penalize, bias, or restrict candidate tokens during
// decoding, plus an optional structured-output constraint.
message DecodingParameters {
  // Penalties
  float presence_penalty = 1; // Default (0.0) means no penalty
  float frequency_penalty = 2; // Default (0.0) means no penalty
  // NOTE(review): vLLM's neutral repetition penalty is 1.0, so 0.0 here is
  // presumably treated as "unset" rather than applied literally -- confirm.
  float repetition_penalty = 3; // Default (0.0) means no penalty
  // Bias values keyed by a uint32 (presumably the token id -- confirm).
  map<uint32, float> logit_bias = 4;
  // Presumably restricts generation to these token ids when non-empty; an
  // empty list is indistinguishable from unset for repeated fields.
  repeated uint32 allowed_token_ids = 5;

  // Wrapper message so that a list of strings can be a oneof member
  // (repeated fields cannot appear directly inside a oneof).
  message StringChoices {
    repeated string choices = 1;
  }

  // Control structured outputs
  // At most one of the following constraints may be set.
  oneof structured_output {
    string json = 6;
    string regex = 7;
    StringChoices choice = 8;
    string grammar = 9;
    bool json_object = 10;
    string structural_tag = 11;
  }
}
|
|
|
|
// Parameters controlling when generation should stop.
message StoppingCriteria {
  // Default (0) is currently 20
  uint32 max_new_tokens = 1;
  // Default (0) means no minimum
  uint32 min_new_tokens = 2;

  // Token ids that act as stop conditions; a match is reported through
  // FinishInfo.stop_token_id.
  repeated uint32 stop_token_ids = 3;
  // Strings that act as stop conditions; a match is reported through
  // FinishInfo.stop_string.
  repeated string stop_strings = 4;
  // Presumably controls whether a matched stop string is kept in the
  // returned text -- confirm against the server implementation.
  bool include_stop_strings = 5;

  // Presumably continues generating past the EOS token when true -- confirm.
  bool ignore_eos = 6;
}
|
|
|
|
// Flags controlling what is returned in the response. The prompt_* options
// correspond to fields of PromptInfo and the output_* options to fields of
// SequenceOutput.
message ResponseOptions {
  // Prompt options
  bool prompt_token_ids = 1;
  bool prompt_logprobs = 2;
  // Which extra candidate tokens to return per prompt position, if any.
  optional CandidateTokens prompt_candidates = 3;

  // Output options; output_text defaults to true
  // (declared `optional` so that an explicit false differs from unset).
  optional bool output_text = 4;
  bool output_token_ids = 5;
  bool output_logprobs = 6;
  // Which extra candidate tokens to return per output position, if any.
  optional CandidateTokens output_candidates = 7;
}
|
|
|
|
// Parameters controlling KV cache/distribution for a request.
message KVCacheParameters {
  // Presumably skips prefix-cache reuse for this request when true -- confirm.
  bool bypass_prefix_cache = 1;
  // NOTE(review): salt for cache keys; exact semantics are not defined in
  // this file -- confirm against the server implementation.
  string cache_salt = 2;

  // KV Connector transfer parameters
  // (untyped, JSON-like structure; the schema is not defined in this file).
  google.protobuf.Struct kv_transfer_params = 3;
}
|
|
|
|
// Controls which extra candidate tokens at each position should be returned
message CandidateTokens {
  // Exactly one selection mode applies; setting one clears the others.
  oneof select {
    // Select by count (presumably the n highest-ranked candidates -- confirm).
    uint32 top_n = 1;
    // Select an explicit set of token ids.
    TokenIds token_ids = 2;
    // Select all candidates.
    bool all = 3;
  }
}
|
|
|
|
// ======================================================================================
// Generate Response
// ======================================================================================
|
|
|
|
// Response message returned by Generate and streamed by GenerateStream.
message GenerateResponse {
  // Only present in first response
  optional PromptInfo prompt_info = 1;
  // NOTE(review): singular despite the plural name, so each response carries
  // one SequenceOutput; SequenceOutput.index identifies the sequence.
  SequenceOutput outputs = 2;
}
|
|
|
|
// Output for one generated sequence (or one chunk of it, per num_tokens).
message SequenceOutput {
  // Index of output sequence for num_sequences > 1.
  uint32 index = 1;

  // Output text (see ResponseOptions.output_text).
  string text = 2;
  uint32 num_tokens = 3; // Number of tokens in this chunk
  repeated uint32 token_ids = 4; // If requested
  repeated float logprobs = 5; // If requested
  repeated uint32 ranks = 6; // If logprobs were requested
  repeated CandidateTokenInfo candidate_tokens = 7; // If requested

  // Only present in final output for this sequence
  optional FinishInfo finish_info = 8;
}
|
|
|
|
// Prompt info, returned in the first response
message PromptInfo {
  // Number of tokens in the prompt.
  uint32 num_prompt_tokens = 1;
  repeated uint32 token_ids = 2; // If requested
  repeated float logprobs = 3; // If requested
  repeated uint32 ranks = 4; // If logprobs were requested
  // Candidate token info for the prompt (see
  // ResponseOptions.prompt_candidates).
  repeated CandidateTokenInfo candidate_tokens = 5;
}
|
|
|
|
// Finish info, returned in the final response
message FinishInfo {
  // Number of output tokens.
  uint32 num_output_tokens = 1;

  // Why generation ended. NOT_FINISHED is the zero value and therefore what
  // readers see when finish_reason is left unset.
  enum FinishReason {
    NOT_FINISHED = 0; // Possibly more tokens to be streamed
    LENGTH = 1; // Finished due to length constraint
    STOP = 2; // Stop string/token or EOS encountered
    ABORTED = 3; // Request aborted/cancelled
  }

  FinishReason finish_reason = 2;
  // One of these will be set when finish_reason == STOP
  oneof stop_reason {
    uint32 stop_token_id = 3;
    uint32 eos_token_id = 4;
    string stop_string = 5;
  }

  // KV transfer parameters returned with the final output (untyped,
  // JSON-like structure; the schema is not defined in this file).
  google.protobuf.Struct kv_transfer_params = 6;
  //uint64 seed = 7;
}
|
|
|
|
// Info for candidate tokens other than the input/sampled
// token at a given position
message CandidateTokenInfo {
  // One candidate token: its id, log-probability and rank.
  message TokenInfo {
    uint32 id = 1;
    float logprob = 2;
    uint32 rank = 3;
    // string text = 4;
    // bytes token_bytes = 5;
  }
  // Candidate token infos at this position
  repeated TokenInfo tokens = 1;
}
|
|
|
|
// Token ids used for prompt
// (also used by CandidateTokens to select an explicit set of token ids).
// Wrapping the list in a message lets it appear as a oneof member, which a
// bare repeated field cannot.
message TokenIds {
  repeated uint32 ids = 1;
}
|
|
|