from typing import Any, TypedDict, Sequence, Literal import sys if sys.version_info < (3, 11): from typing_extensions import NotRequired else: from typing import NotRequired class BaseGenerateResponse(TypedDict): model: str "Model used to generate response." created_at: str "Time when the request was created." done: bool "True if response is complete, otherwise False. Useful for streaming to detect the final response." total_duration: int "Total duration in nanoseconds." load_duration: int "Load duration in nanoseconds." prompt_eval_count: int "Number of tokens evaluated in the prompt." prompt_eval_duration: int "Duration of evaluating the prompt in nanoseconds." eval_count: int "Number of tokens evaluated in inference." eval_duration: int "Duration of evaluating inference in nanoseconds." class GenerateResponse(BaseGenerateResponse): """ Response returned by generate requests. """ response: str "Response content. When streaming, this contains a fragment of the response." context: Sequence[int] "Tokenized history up to the point of the response." class Message(TypedDict): """ Chat message. """ role: Literal['user', 'assistant', 'system'] "Assumed role of the message. Response messages always has role 'assistant'." content: str "Content of the message. Response messages contains message fragments when streaming." images: NotRequired[Sequence[Any]] """ Optional list of image data for multimodal models. Valid input types are: - `str` or path-like object: path to image file - `bytes` or bytes-like object: raw image data Valid image formats depend on the model. See the model card for more information. """ class ChatResponse(BaseGenerateResponse): """ Response returned by chat requests. """ message: Message "Response message." 
class ProgressResponse(TypedDict):
    """Progress update for long-running model operations."""

    status: str
    completed: int
    total: int
    digest: str


class Options(TypedDict, total=False):
    """Model options; every key may be omitted."""

    # Options applied when the model is loaded.
    numa: bool
    num_ctx: int
    num_batch: int
    num_gqa: int
    num_gpu: int
    main_gpu: int
    low_vram: bool
    f16_kv: bool
    logits_all: bool
    vocab_only: bool
    use_mmap: bool
    use_mlock: bool
    embedding_only: bool
    rope_frequency_base: float
    rope_frequency_scale: float
    num_thread: int

    # Options applied at inference time.
    num_keep: int
    seed: int
    num_predict: int
    top_k: int
    top_p: float
    tfs_z: float
    typical_p: float
    repeat_last_n: int
    temperature: float
    repeat_penalty: float
    presence_penalty: float
    frequency_penalty: float
    mirostat: int
    mirostat_tau: float
    mirostat_eta: float
    penalize_newline: bool
    stop: Sequence[str]


class RequestError(Exception):
    """
    Common class for request errors.
    """

    def __init__(self, content: str):
        super().__init__(content)
        # Human-readable reason for the error.
        self.content = content


class ResponseError(Exception):
    """
    Common class for response errors.
    """

    def __init__(self, content: str, status_code: int = -1):
        super().__init__(content)
        # Human-readable reason for the error.
        self.content = content
        # HTTP status code of the response; -1 when no status is available.
        self.status_code = status_code