Update examples and readmes

ParthSareen 2024-11-19 14:31:18 -08:00
parent 238f142a5c
commit ffe7588093
7 changed files with 143 additions and 107 deletions

README.md
View File

@@ -4,8 +4,9 @@ The Ollama Python library provides the easiest way to integrate Python 3.8+ proj
## Prerequisites
- Install [Ollama](https://ollama.com/download)
- Pull a model: `ollama pull <model>`. See [Ollama models](https://ollama.com/models)
- [Ollama](https://ollama.com/download) should be installed and running
- Pull a model to use with the library: `ollama pull <model>`, e.g. `ollama pull llama3.1`
- See [Ollama models](https://ollama.com/models) for more information on the models available.
## Install
@@ -16,24 +17,32 @@ pip install ollama
## Usage
```python
import ollama
response = ollama.chat(model='llama3.1', messages=[
from ollama import chat
from ollama._types import ChatResponse
response: ChatResponse = chat(model='llama3.1', messages=[
{
'role': 'user',
'content': 'Why is the sky blue?',
},
])
print(response['message']['content'])
# or access fields directly from the response object
print(response.message.content)
```
See [_types.py](ollama/_types.py) for more information on the response types.
## Streaming responses
Response streaming can be enabled by setting `stream=True`, modifying function calls to return a Python generator where each part is an object in the stream.
Response streaming can be enabled by setting `stream=True`.
Streaming Tool/Function calling is not yet supported.
```python
import ollama
from ollama import chat
stream = ollama.chat(
stream = chat(
model='llama3.1',
messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
stream=True,
@@ -43,6 +52,62 @@ for chunk in stream:
print(chunk['message']['content'], end='', flush=True)
```
## Custom client
A custom client can be created with the following fields:
- `host`: The Ollama host (default: `http://localhost:11434`)
- `timeout`: The timeout for requests (default: `None`)
- `follow_redirects`: Whether to follow redirects (default: `True`)
- `headers`: Additional headers to send with requests (default: `{}`)
```python
from ollama import Client
client = Client()
# or
client = Client(
host='http://localhost:11434',
timeout=None,
follow_redirects=True,
headers={'x-some-header': 'some-value'}
)
response = client.chat(model='llama3.1', messages=[
{
'role': 'user',
'content': 'Why is the sky blue?',
},
])
```
## Async client
The `AsyncClient` class is used to make asynchronous requests. It can be configured with the same fields as the `Client` class.
```python
import asyncio
from ollama import AsyncClient
async def chat():
message = {'role': 'user', 'content': 'Why is the sky blue?'}
response = await AsyncClient().chat(model='llama3.1', messages=[message])
asyncio.run(chat())
```
Setting `stream=True` modifies functions to return a Python asynchronous generator:
```python
import asyncio
from ollama import AsyncClient
async def chat():
message = {'role': 'user', 'content': 'Why is the sky blue?'}
async for part in await AsyncClient().chat(model='llama3.1', messages=[message], stream=True):
print(part['message']['content'], end='', flush=True)
asyncio.run(chat())
```
## API
The Ollama Python library's API is designed around the [Ollama REST API](https://github.com/ollama/ollama/blob/main/docs/api.md)
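For example, a minimal sketch of two top-level calls, assuming a running Ollama server with `llama3.1` already pulled:
```python
import ollama

# one-shot completion; mirrors the REST API's generate endpoint
result = ollama.generate(model='llama3.1', prompt='Why is the sky blue?')
print(result['response'])

# embeddings; mirrors the REST API's embed endpoint
embeddings = ollama.embed(model='llama3.1', input='The sky is blue because of rayleigh scattering')
print(len(embeddings['embeddings'][0]))
```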
@@ -124,50 +189,6 @@ ollama.embed(model='llama3.1', input=['The sky is blue because of rayleigh scatt
ollama.ps()
```
## Custom client
A custom client can be created with the following fields:
- `host`: The Ollama host to connect to
- `timeout`: The timeout for requests
```python
from ollama import Client
client = Client(host='http://localhost:11434')
response = client.chat(model='llama3.1', messages=[
{
'role': 'user',
'content': 'Why is the sky blue?',
},
])
```
## Async client
```python
import asyncio
from ollama import AsyncClient
async def chat():
message = {'role': 'user', 'content': 'Why is the sky blue?'}
response = await AsyncClient().chat(model='llama3.1', messages=[message])
asyncio.run(chat())
```
Setting `stream=True` modifies functions to return a Python asynchronous generator:
```python
import asyncio
from ollama import AsyncClient
async def chat():
message = {'role': 'user', 'content': 'Why is the sky blue?'}
async for part in await AsyncClient().chat(model='llama3.1', messages=[message], stream=True):
print(part['message']['content'], end='', flush=True)
asyncio.run(chat())
```
## Errors
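Errors from failed requests surface as `ollama.ResponseError`, which carries an `error` message and a `status_code`. A minimal sketch of the handling pattern (the model name is just a placeholder):
```python
import ollama

model = 'does-not-yet-exist'

try:
  ollama.chat(model)
except ollama.ResponseError as e:
  print('Error:', e.error)
  if e.status_code == 404:
    ollama.pull(model)
```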

View File

@@ -9,53 +9,47 @@ cd examples/
python3 <example>.py
```
### Chat
- [chat.py](chat.py) - Basic chat with model
- [chat-stream.py](chat-stream.py) - Stream chat with model
- [async-chat.py](async-chat.py) - Async chat with model
### Chat - Chat with a model
- [chat.py](chat.py)
- [async-chat.py](async-chat.py)
- [chat-stream.py](chat-stream.py) - Streamed outputs
### Generate
- [generate.py](generate.py) - Generate text with model
- [generate-stream.py](generate-stream.py) - Stream generate text with model
- [async-generate.py](async-generate.py) - Async generate text with model
### Generate - Generate text with a model
- [generate.py](generate.py)
- [async-generate.py](async-generate.py)
- [generate-stream.py](generate-stream.py) - Streamed outputs
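A minimal sketch of a streamed generate call (the model name is just an example):
```python
from ollama import generate

# print tokens as they are produced
for part in generate(model='llama3.1', prompt='Why is the sky blue?', stream=True):
  print(part['response'], end='', flush=True)
```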
### List
- [list.py](list.py) - List all downloaded models and their properties
- [async-list.py](async-list.py) - Async list all downloaded models and their properties
### Tools/Function Calling - Call a function with a model
- [tools.py](tools.py) - Simple example of Tools/Function Calling
- [async-tools.py](async-tools.py)
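A minimal sketch of tool calling, assuming plain Python functions can be passed as tools (0.4+); the function and prompt are only examples:
```python
from ollama import chat


def add_two_numbers(a: int, b: int) -> int:
  """Add two numbers.

  Args:
    a: The first number
    b: The second number
  """
  return a + b


response = chat(
  model='llama3.1',
  messages=[{'role': 'user', 'content': 'What is 10 + 10?'}],
  tools=[add_two_numbers],  # functions can be passed directly as tools
)

# run any tool calls the model requested
for tool_call in response.message.tool_calls or []:
  if tool_call.function.name == 'add_two_numbers':
    print(add_two_numbers(**tool_call.function.arguments))
```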
### Fill in the middle
- [fill-in-middle.py](fill-in-middle.py) - Fill in the middle with model
### Multimodal - Chat with a multimodal model
- [multimodal_chat.py](multimodal_chat.py)
- [multimodal_generate.py](multimodal_generate.py)
### Multimodal
- [multimodal.py](multimodal.py) - Multimodal chat with model
### Pull Progress
### Ollama List - List all downloaded models and their properties
- [list.py](list.py)
- [async-list.py](async-list.py)
### Ollama Pull - Pull a model from Ollama
Requirement: `pip install tqdm`
- [pull-progress.py](pull-progress.py) - Pull progress with model
- [pull.py](pull.py)
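A minimal sketch of streaming a pull with `tqdm` progress bars (which is why `tqdm` is required); the model name is just an example, and the field names assume the library's progress responses expose `status`, `digest`, `total`, and `completed`:
```python
from tqdm import tqdm

from ollama import pull

current_digest, bars = '', {}
for progress in pull('llama3.1', stream=True):
  digest = progress.digest or ''
  if digest != current_digest and current_digest in bars:
    bars[current_digest].close()

  if not digest:
    print(progress.status)
    continue

  if digest not in bars and progress.total:
    # one progress bar per layer, keyed by its digest
    bars[digest] = tqdm(total=progress.total, desc=f'pulling {digest[7:19]}', unit='B', unit_scale=True)

  if progress.completed and digest in bars:
    bars[digest].update(progress.completed - bars[digest].n)

  current_digest = digest
```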
### Ollama create (create a model)
- [create.py](create.py) - Create a model
### Ollama ps (show model status - cpu/gpu usage)
- [ollama-ps.py](ollama-ps.py) - Ollama ps
### Tools/Function Calling
- [tools.py](tools.py) - Simple example of Tools/Function Calling
- [async-tools.py](async-tools.py) - Async example of Tools/Function Calling
## Configuring Clients
Custom parameters can be passed to the client when initializing:
### Ollama Create - Create a model from a Modelfile
```python
import ollama
client = ollama.Client(
host='http://localhost:11434',
timeout=10.0, # Default: None
follow_redirects=True, # Default: True
headers={'x-some-header': 'some-value'}
)
python create.py <model> <modelfile>
```
- [create.py](create.py)
See [ollama/docs/modelfile.md](https://github.com/ollama/ollama/blob/main/docs/modelfile.md) for more information on the Modelfile format.
### Fill in the middle
- [fill-in-middle.py](fill-in-middle.py) - Given a prefix and suffix, fill in the middle
### Ollama ps - Show model status with CPU/GPU usage
- [ps.py](ps.py)
Similarly, the `AsyncClient` class can be configured with the same parameters.

View File

@@ -8,13 +8,23 @@ if len(args) == 2:
# create from local file
path = args[1]
else:
print('usage: python main.py <name> <filepath>')
print('usage: python create.py <name> <filepath>')
sys.exit(1)
# TODO: update to real Modelfile values
modelfile = f"""
FROM {path}
"""
example_modelfile = """
FROM llama3.2
# sets the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
# sets the context window size to 4096; this controls how many tokens the LLM can use as context to generate the next token
PARAMETER num_ctx 4096
# sets a custom system message to specify the behavior of the chat assistant
SYSTEM You are Mario from Super Mario Bros., acting as an assistant.
"""
for response in create(model=args[0], modelfile=modelfile, stream=True):
print(response['status'])

View File

@@ -0,0 +1,17 @@
from ollama import Client
client = Client()
path = ''  # set to the path of a local image file
# note: an invalid image path currently raises an unhelpful error
response = client.chat(
model='llama3.2-vision',
messages=[
{
'role': 'user',
'content': 'What is in this image? Be concise. Respond with the structure {"focal": "...", "subject": "...", "background": "..."}',
'images': [path],
}
],
)
print(response.message.content)

View File

@@ -1,5 +1,7 @@ from ollama import ps, pull, chat
from ollama import ps, pull, chat
from ollama import ps, pull
from ollama._types import ProcessResponse
# Ensure at least one model is loaded
response = pull('llama3.1', stream=True)
progress_states = set()
for progress in response:
@@ -10,22 +12,14 @@ for progress in response:
print('\n')
response = chat('llama3.1', messages=[{'role': 'user', 'content': 'Hello!'}])
print(response['message']['content'])
print('\n')
response: ProcessResponse = ps()
for model in response.models:
print(f'Model: {model.model}')
print(f'Digest: {model.digest}')
print(f'Expires at: {model.expires_at}')
print(f'Size: {model.size}')
print(f'Size vram: {model.size_vram}')
print(f'Details: {model.details}')
response = ps()
name = response['models'][0]['name']
size = response['models'][0]['size']
size_vram = response['models'][0]['size_vram']
if size == size_vram:
print(f'{name}: 100% GPU')
elif not size_vram:
print(f'{name}: 100% CPU')
else:
size_cpu = size - size_vram
cpu_percent = round(size_cpu / size * 100)
print(f'{name}: {cpu_percent}% CPU/{100 - cpu_percent}% GPU')
print('---' * 10)