// vllm/rust/proto/vllm_grpc.proto

syntax = "proto3";
package vllm;

import "google/protobuf/struct.proto";


// Text generation service. Both RPCs accept the same GenerateRequest and
// produce GenerateResponse messages; they differ only in whether the
// response is a single message or a server-side stream.
service Generate {
  // Generates text given a prompt
  // (unary: one request, one response)
  rpc Generate (GenerateRequest) returns (GenerateResponse) {}
  // Generates text given a prompt, streaming the outputs
  // (server-streaming: one request, a stream of responses)
  rpc GenerateStream (GenerateRequest) returns (stream GenerateResponse) {}
}

// ======================================================================================
// Generate Request
// ======================================================================================

// Request for both Generate and GenerateStream: a prompt plus optional
// groups of sampling, decoding, stopping, response and KV-cache settings.
message GenerateRequest {
  // Identifier for this request.
  string request_id = 1;
  // Model identifier. NOTE(review): how this value is resolved is not
  // specified in this file.
  string model = 2;

  // The prompt, supplied either as raw text or as token ids. Being a oneof,
  // at most one of the two members is set.
  oneof prompt {
    string text = 3;
    TokenIds token_ids = 4;
  }

  // Temperature, defaults to model-specific default or 0
  // (declared optional so an explicit 0 is distinguishable from unset)
  optional float temperature = 5;
  // Parameters controlling random sampling, not applicable if temperature == 0
  RandomSampling sampling = 6;
  // Parameters for conditionally penalizing/boosting
  // candidate tokens during decoding
  DecodingParameters decoding = 7;
  // Parameters controlling when generation should stop
  StoppingCriteria stopping = 8;
  // Flags to control what is returned in the response
  ResponseOptions response = 9;
  // Parameters controlling KV cache/distribution
  KVCacheParameters kv = 10;

  // Truncate prompt tokens; default (0) means no truncation
  uint32 truncate_prompt_tokens = 11;

  // NOTE(review): presumably a scheduling priority; whether higher or lower
  // values are served first is not specified here — confirm with the server.
  int32 priority = 12;
}

// Parameters controlling random sampling. For the unlabelled scalar fields a
// value of 0 is indistinguishable from "not set" on the wire, which is why 0
// is used to select the default.
message RandomSampling {
  uint32 num_sequences = 1;  // "n", default (0) means 1
  uint32 top_k = 2;  // 0 means default
  float top_p = 3;  // 0 means default
  float min_p = 4;  // 0 means default
  // Declared optional, so an explicit seed of 0 is distinguishable from
  // no seed being supplied.
  optional int64 seed = 5;
}

// Parameters for penalizing/boosting candidate tokens during decoding and
// for constraining the output to a structured format.
message DecodingParameters {
  // Penalties
  float presence_penalty = 1;  // Default (0.0) means no penalty
  float frequency_penalty = 2;  // Default (0.0) means no penalty
  // NOTE(review): repetition penalties are commonly multiplicative with 1.0
  // as the neutral value; confirm the server treats the wire default 0.0 as
  // "no penalty" rather than applying it literally.
  float repetition_penalty = 3;  // Default (0.0) means no penalty
  // Map keyed by uint32 with float values.
  // NOTE(review): presumably token id -> bias applied to that token's
  // logit — confirm.
  map<uint32, float> logit_bias = 4;
  // NOTE(review): presumably restricts generation to these token ids, with
  // an empty list meaning no restriction — confirm.
  repeated uint32 allowed_token_ids = 5;

  // Wrapper message so that a list of strings can be used as a oneof member
  // (repeated fields cannot appear directly inside a oneof).
  message StringChoices {
    repeated string choices = 1;
  }

  // Control structured outputs
  // At most one of the following members is set.
  // NOTE(review): the exact format expected for each string member (e.g.
  // whether `json` carries a JSON schema) is not specified in this file.
  oneof structured_output {
    string json = 6;
    string regex = 7;
    StringChoices choice = 8;
    string grammar = 9;
    bool json_object = 10;
    string structural_tag = 11;
  }
}

// Parameters controlling when generation should stop.
message StoppingCriteria {
  // Default (0) is currently 20
  uint32 max_new_tokens = 1;
  // Default (0) means no minimum
  uint32 min_new_tokens = 2;

  // Token ids and strings that act as stop conditions.
  // NOTE(review): presumably generation ends when any one of them is
  // produced — confirm.
  repeated uint32 stop_token_ids = 3;
  repeated string stop_strings = 4;
  // NOTE(review): presumably controls whether a matched stop string is kept
  // in the returned text — confirm.
  bool include_stop_strings = 5;

  // NOTE(review): presumably continues generating past the EOS token when
  // true — confirm.
  bool ignore_eos = 6;
}

// Flags to control what is returned in the response, split into options
// about the prompt and options about the generated output.
message ResponseOptions {
  // Prompt options
  bool prompt_token_ids = 1;
  bool prompt_logprobs = 2;
  // Selects which extra candidate tokens to return per prompt position.
  optional CandidateTokens prompt_candidates = 3;

  // Output options; output_text defaults to true
  // (output_text is declared optional so that "unset" can be told apart from
  // an explicit false, which a default of true requires)
  optional bool output_text = 4;
  bool output_token_ids = 5;
  bool output_logprobs = 6;
  // Selects which extra candidate tokens to return per output position.
  optional CandidateTokens output_candidates = 7;
}

// Parameters controlling KV cache/distribution.
message KVCacheParameters {
  // NOTE(review): presumably skips prefix-cache reuse for this request when
  // true — confirm.
  bool bypass_prefix_cache = 1;
  // NOTE(review): presumably a salt mixed into cache keys to isolate cache
  // entries between callers — confirm.
  string cache_salt = 2;

  // KV Connector transfer parameters
  // (free-form, JSON-like structure; its schema is not defined in this file)
  google.protobuf.Struct kv_transfer_params = 3;
}

// Controls which extra candidate tokens at each position should be returned
message CandidateTokens {
  // Selection mode; at most one member is set.
  oneof select {
    // NOTE(review): presumably the N highest-ranked candidates — confirm.
    uint32 top_n = 1;
    // An explicit list of token ids.
    TokenIds token_ids = 2;
    // NOTE(review): presumably every token in the vocabulary when true —
    // confirm.
    bool all = 3;
  }
}

// ======================================================================================
// Generate Response
// ======================================================================================

// Response message for Generate and one element of the GenerateStream
// response stream.
message GenerateResponse {
  // Only present in first response
  optional PromptInfo prompt_info = 1;
  // A single SequenceOutput: the field is singular despite its plural name.
  // SequenceOutput.index identifies which output sequence it belongs to.
  // NOTE(review): how num_sequences > 1 is delivered through the unary
  // Generate RPC is not evident from this file — confirm.
  SequenceOutput outputs = 2;
}

// Output for one generated sequence, or one chunk of it.
message SequenceOutput {
  // Index of output sequence for num_sequences > 1.
  uint32 index = 1;

  string text = 2;
  uint32 num_tokens = 3;  // Number of tokens in this chunk
  // NOTE(review): token_ids, logprobs, ranks and candidate_tokens look like
  // parallel per-token lists, presumably aligned by position — confirm.
  repeated uint32 token_ids = 4;  // If requested
  repeated float logprobs = 5;  // If requested
  repeated uint32 ranks = 6;  // If logprobs were requested
  repeated CandidateTokenInfo candidate_tokens = 7; // If requested

  // Only present in final output for this sequence
  optional FinishInfo finish_info = 8;
}

// Prompt info, returned in the first response
message PromptInfo {
  uint32 num_prompt_tokens = 1;
  // NOTE(review): token_ids, logprobs, ranks and candidate_tokens look like
  // parallel per-position lists, presumably aligned by prompt position —
  // confirm.
  repeated uint32 token_ids = 2;  // If requested
  repeated float logprobs = 3;  // If requested
  repeated uint32 ranks = 4;  // If logprobs were requested
  // Extra candidate tokens per prompt position.
  repeated CandidateTokenInfo candidate_tokens = 5;
}

// Finish info, returned in the final response
message FinishInfo {
  uint32 num_output_tokens = 1;

  // Why generation ended. The zero value NOT_FINISHED is also what an unset
  // finish_reason decodes to.
  // NOTE(review): values are not prefixed with the enum name; renaming them
  // would break generated code, so they are left as-is.
  enum FinishReason {
    NOT_FINISHED = 0;  // Possibly more tokens to be streamed
    LENGTH = 1;  // Finished due to length constraint
    STOP = 2;  // Stop string/token or EOS encountered
    ABORTED = 3;  // Request aborted/cancelled
  }

  FinishReason finish_reason = 2;
  // One of these will be set when finish_reason == STOP
  oneof stop_reason {
    uint32 stop_token_id = 3;
    uint32 eos_token_id = 4;
    string stop_string = 5;
  }

  // Free-form, JSON-like structure; its schema is not defined in this file.
  // NOTE(review): presumably the response-side counterpart of
  // KVCacheParameters.kv_transfer_params — confirm.
  google.protobuf.Struct kv_transfer_params = 6;
  // Disabled field kept for reference; field number 7 is currently unused.
  //uint64 seed = 7;
}

// Info for candidate tokens other than the input/sampled
// token at a given position
message CandidateTokenInfo {
  // One candidate token: its id, log-probability and rank.
  message TokenInfo {
    uint32 id = 1;
    float logprob = 2;
    uint32 rank = 3;
    // Disabled fields kept for reference; numbers 4 and 5 are currently
    // unused.
    //    string text = 4;
    //    bytes token_bytes = 5;
  }
  // Candidate token infos at this position
  repeated TokenInfo tokens = 1;
}

// Token ids used for prompt
// Also referenced by CandidateTokens to list explicit candidate token ids.
// Wrapping the list in a message lets it be used as a oneof member, which a
// bare repeated field cannot be.
message TokenIds {
  repeated uint32 ids = 1;
}