mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Examples][last/6] Resettle examples. (#41084)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io> Signed-off-by: wang.yuqi <noooop@126.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -136,8 +136,6 @@ run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||
run_and_track_test 6 "test_tpu_model_runner.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
|
||||
run_and_track_test 7 "test_sampler.py" \
|
||||
|
||||
@@ -394,8 +394,8 @@ steps:
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# Features demo
|
||||
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 deployment/llm_engine_example.py
|
||||
- python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
|
||||
@@ -1649,8 +1649,8 @@ steps:
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# Features demo
|
||||
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 deployment/llm_engine_example.py
|
||||
- python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
|
||||
@@ -2930,8 +2930,8 @@ steps:
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# Features demo
|
||||
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 deployment/llm_engine_example.py
|
||||
- python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
|
||||
|
||||
@@ -117,8 +117,8 @@ steps:
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 deployment/llm_engine_example.py
|
||||
- python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
|
||||
@@ -37,7 +37,7 @@ steps:
|
||||
- examples/generate/multimodal/
|
||||
- examples/features/
|
||||
- examples/pooling/embed/vision_embedding_offline.py
|
||||
- examples/others/tensorize_vllm_model.py
|
||||
- examples/features/tensorize_vllm_model.py
|
||||
commands:
|
||||
- set -x
|
||||
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||
@@ -55,8 +55,8 @@ steps:
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 deployment/llm_engine_example.py
|
||||
- python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
|
||||
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
||||
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
|
||||
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
|
||||
|
||||
+1
-1
@@ -860,7 +860,7 @@ LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm" \
|
||||
# define sagemaker first, so it is not default from `docker build`
|
||||
FROM vllm-openai-base AS vllm-sagemaker
|
||||
|
||||
COPY examples/online_serving/sagemaker-entrypoint.sh .
|
||||
COPY examples/deployment/sagemaker-entrypoint.sh .
|
||||
RUN chmod +x sagemaker-entrypoint.sh
|
||||
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
|
||||
|
||||
|
||||
@@ -278,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
|
||||
http://localhost:8000/v1/audio/translations
|
||||
```
|
||||
|
||||
Or check out more examples in [examples/online_serving](../../../examples/online_serving).
|
||||
Or check out more examples in [examples/speech_to_text](../../../examples/speech_to_text).
|
||||
|
||||
!!! note
|
||||
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
|
||||
|
||||
@@ -17,7 +17,7 @@ Before you begin, ensure that you have the following:
|
||||
|
||||
## Installing the chart
|
||||
|
||||
This guide uses the Helm chart at [examples/online_serving/chart-helm](../../../examples/online_serving/chart-helm).
|
||||
This guide uses the Helm chart at [examples/deployment/chart-helm](../../../examples/deployment/chart-helm).
|
||||
|
||||
To install the chart with the release name `test-vllm`:
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ Deploy the following yaml file `lws.yaml`
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
|
||||
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
|
||||
vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2"
|
||||
resources:
|
||||
limits:
|
||||
@@ -73,7 +73,7 @@ Deploy the following yaml file `lws.yaml`
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
|
||||
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
|
||||
@@ -36,7 +36,7 @@ pip install -U vllm \
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
||||
```
|
||||
|
||||
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)
|
||||
1. Use the script: [examples/applications/rag/retrieval_augmented_generation_with_langchain.py](../../../examples/applications/rag/retrieval_augmented_generation_with_langchain.py)
|
||||
|
||||
1. Run the script
|
||||
|
||||
@@ -74,7 +74,7 @@ pip install vllm \
|
||||
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
|
||||
```
|
||||
|
||||
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)
|
||||
1. Use the script: [examples/applications/rag/retrieval_augmented_generation_with_llamaindex.py](../../../examples/applications/rag/retrieval_augmented_generation_with_llamaindex.py)
|
||||
|
||||
1. Run the script:
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
|
||||
|
||||
echo 'Starting gradio server...'
|
||||
git clone https://github.com/vllm-project/vllm.git || true
|
||||
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
|
||||
python vllm/examples/applications/chatbot/gradio_openai_chatbot_webserver.py \
|
||||
-m $MODEL_NAME \
|
||||
--port 8811 \
|
||||
--model-url http://localhost:8081/v1 \
|
||||
@@ -305,7 +305,7 @@ It is also possible to access the Llama-3 service with a separate GUI frontend,
|
||||
|
||||
echo 'Starting gradio server...'
|
||||
git clone https://github.com/vllm-project/vllm.git || true
|
||||
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
|
||||
python vllm/examples/applications/chatbot/gradio_openai_chatbot_webserver.py \
|
||||
-m $MODEL_NAME \
|
||||
--port 8811 \
|
||||
--model-url http://$ENDPOINT/v1 \
|
||||
|
||||
@@ -20,7 +20,7 @@ pip install vllm streamlit openai
|
||||
vllm serve Qwen/Qwen1.5-0.5B-Chat
|
||||
```
|
||||
|
||||
1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)
|
||||
1. Use the script: [examples/applications/chatbot/streamlit_openai_chatbot_webserver.py](../../../examples/applications/chatbot/streamlit_openai_chatbot_webserver.py)
|
||||
|
||||
1. Start the streamlit web UI and start to chat:
|
||||
|
||||
|
||||
@@ -78,7 +78,7 @@ Key points from the example YAML:
|
||||
- sh
|
||||
- -c
|
||||
- >
|
||||
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
|
||||
bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=2;
|
||||
python3 -m vllm.entrypoints.openai.api_server
|
||||
--port 8080
|
||||
--model meta-llama/Llama-3.1-405B-Instruct
|
||||
@@ -93,7 +93,7 @@ Key points from the example YAML:
|
||||
- sh
|
||||
- -c
|
||||
- >
|
||||
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)
|
||||
bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)
|
||||
```
|
||||
|
||||
---
|
||||
@@ -144,7 +144,7 @@ spec:
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
|
||||
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=2;
|
||||
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2"
|
||||
resources:
|
||||
limits:
|
||||
@@ -178,7 +178,7 @@ spec:
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)"
|
||||
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
|
||||
@@ -2,6 +2,6 @@
|
||||
|
||||
vLLM's examples are split into three categories:
|
||||
|
||||
- If you are using vLLM from within Python code, see the [Offline Inference](../../examples/offline_inference) section.
|
||||
- If you are using vLLM from an HTTP application or client, see the [Online Serving](../../examples/online_serving) section.
|
||||
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](../../examples/others) section.
|
||||
- If you are using vLLM from within Python code, see the [Offline Inference](.) section.
|
||||
- If you are using vLLM from an HTTP application or client, see the [Online Serving](.) section.
|
||||
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](.) section.
|
||||
|
||||
@@ -47,7 +47,7 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
|
||||
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/llm_engine_example.py \
|
||||
python examples/deployment/llm_engine_example.py \
|
||||
--model TheBloke/Llama-2-7b-Chat-AWQ \
|
||||
--quantization awq
|
||||
```
|
||||
|
||||
@@ -58,7 +58,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
|
||||
To run a GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
|
||||
|
||||
```bash
|
||||
python examples/offline_inference/llm_engine_example.py \
|
||||
python examples/deployment/llm_engine_example.py \
|
||||
--model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
|
||||
```
|
||||
|
||||
|
||||
@@ -157,7 +157,7 @@ OpenAI Python client library does not officially support `reasoning` attribute f
|
||||
print(content, end="", flush=True)
|
||||
```
|
||||
|
||||
Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
|
||||
Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/reasoning/openai_chat_completion_with_reasoning_streaming.py).
|
||||
|
||||
## Tool Calling
|
||||
|
||||
|
||||
@@ -88,7 +88,7 @@ vllm serve facebook/opt-13b \
|
||||
-tp=8
|
||||
```
|
||||
|
||||
By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the [examples/online_serving/run_cluster.sh](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/run_cluster.sh) helper script.
|
||||
By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the [examples/ray_serving/run_cluster.sh](https://github.com/vllm-project/vllm/blob/main/examples/ray_serving/run_cluster.sh) helper script.
|
||||
|
||||
--8<-- [end:supported-features]
|
||||
--8<-- [start:distributed-backend]
|
||||
|
||||
@@ -14,7 +14,7 @@ To install `tensorizer`, run `pip install vllm[tensorizer]`.
|
||||
## The basics
|
||||
|
||||
To load a model using Tensorizer, the model first needs to be serialized by
|
||||
Tensorizer. [The example script](../../examples/others/tensorize_vllm_model.md) takes care of this process.
|
||||
Tensorizer. [The example script](../../../examples/features/tensorize_vllm_model.py) takes care of this process.
|
||||
|
||||
Let's walk through a basic example by serializing `facebook/opt-125m` using the script, and then loading it for inference.
|
||||
|
||||
@@ -25,7 +25,7 @@ CLI arguments. The docstring for the script itself explains the CLI args
|
||||
and how to use it properly in great detail, and we'll use one of the examples from the docstring directly, assuming we want to serialize and save our model at our S3 bucket example `s3://my-bucket`:
|
||||
|
||||
```bash
|
||||
python examples/others/tensorize_vllm_model.py \
|
||||
python examples/features/tensorize_vllm_model.py \
|
||||
--model facebook/opt-125m \
|
||||
serialize \
|
||||
--serialized-directory s3://my-bucket \
|
||||
@@ -35,7 +35,7 @@ python examples/others/tensorize_vllm_model.py \
|
||||
This saves the model tensors at `s3://my-bucket/vllm/facebook/opt-125m/v1`. If you intend on applying a LoRA adapter to your tensorized model, you can pass the HF id of the LoRA adapter in the above command, and the artifacts will be saved there too:
|
||||
|
||||
```bash
|
||||
python examples/others/tensorize_vllm_model.py \
|
||||
python examples/features/tensorize_vllm_model.py \
|
||||
--model facebook/opt-125m \
|
||||
--lora-path <lora_id> \
|
||||
serialize \
|
||||
@@ -71,7 +71,7 @@ llm = LLM(
|
||||
As an example, CPU concurrency can be limited when serializing with `tensorizer` via the `limit_cpu_concurrency` parameter in the initializer for `TensorSerializer`. To set `limit_cpu_concurrency` to some arbitrary value, you would do so like this when serializing:
|
||||
|
||||
```bash
|
||||
python examples/others/tensorize_vllm_model.py \
|
||||
python examples/features/tensorize_vllm_model.py \
|
||||
--model facebook/opt-125m \
|
||||
--lora-path <lora_id> \
|
||||
serialize \
|
||||
|
||||
@@ -251,7 +251,7 @@ The following extra parameters are supported:
|
||||
Our Responses API is compatible with [OpenAI's Responses API](https://platform.openai.com/docs/api-reference/responses);
|
||||
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
|
||||
|
||||
Code example: [examples/online_serving/openai_responses_client_with_tools.py](../../examples/tool_calling/openai_responses_client_with_tools.py)
|
||||
Code example: [examples/tool_calling/openai_responses_client_with_tools.py](../../examples/tool_calling/openai_responses_client_with_tools.py)
|
||||
|
||||
#### Extra parameters
|
||||
|
||||
@@ -456,8 +456,8 @@ Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono chan
|
||||
|
||||
#### Example Clients
|
||||
|
||||
- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_client.py) - Upload and transcribe an audio file
|
||||
- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription
|
||||
- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/speech_to_text/realtime/openai_realtime_client.py) - Upload and transcribe an audio file
|
||||
- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/speech_to_text/realtime/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription
|
||||
|
||||
### Tokenizer API
|
||||
|
||||
|
||||
+1
-1
@@ -5,7 +5,7 @@ Start vLLM API server:
|
||||
vllm serve meta-llama/Llama-2-7b-chat-hf
|
||||
|
||||
Start Gradio OpenAI Chatbot Webserver:
|
||||
python examples/online_serving/gradio_openai_chatbot_webserver.py \
|
||||
python examples/applications/chatbot/gradio_openai_chatbot_webserver.py \
|
||||
-m meta-llama/Llama-2-7b-chat-hf
|
||||
|
||||
Note that `pip install --upgrade gradio` is needed to run this example.
|
||||
+1
-1
@@ -6,7 +6,7 @@ Start vLLM API server:
|
||||
--model meta-llama/Llama-2-7b-chat-hf
|
||||
|
||||
Start Webserver:
|
||||
python examples/online_serving/gradio_webserver.py
|
||||
python examples/applications/chatbot/gradio_webserver.py
|
||||
|
||||
Note that `pip install --upgrade gradio` is needed to run this example.
|
||||
More details: https://github.com/gradio-app/gradio
|
||||
+1
-1
@@ -8,7 +8,7 @@ token-by-token output in offline inference scenarios. It demonstrates DELTA mode
|
||||
streaming where you receive new tokens as they are generated.
|
||||
|
||||
Usage:
|
||||
python examples/offline_inference/async_llm_streaming.py
|
||||
python examples/deployment/async_llm_streaming.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
@@ -33,7 +33,7 @@ https://github.com/coreweave/tensorizer
|
||||
To serialize a model, install vLLM from source, then run something
|
||||
like this from the root level of this repository:
|
||||
|
||||
python examples/others/tensorize_vllm_model.py \
|
||||
python examples/features/tensorize_vllm_model.py \
|
||||
--model facebook/opt-125m \
|
||||
serialize \
|
||||
--serialized-directory s3://my-bucket \
|
||||
@@ -53,7 +53,7 @@ providing a `--keyfile` argument.
|
||||
To deserialize a model, you can run something like this from the root
|
||||
level of this repository:
|
||||
|
||||
python examples/others/tensorize_vllm_model.py \
|
||||
python examples/features/tensorize_vllm_model.py \
|
||||
--model EleutherAI/gpt-j-6B \
|
||||
--dtype float16 \
|
||||
deserialize \
|
||||
@@ -71,11 +71,11 @@ shard's rank. Sharded models serialized with this script will be named as
|
||||
model-rank-%03d.tensors
|
||||
|
||||
For more information on the available arguments for serializing, run
|
||||
`python -m examples.others.tensorize_vllm_model serialize --help`.
|
||||
`python -m examples.features.tensorize_vllm_model serialize --help`.
|
||||
|
||||
Or for deserializing:
|
||||
|
||||
`python examples/others/tensorize_vllm_model.py deserialize --help`.
|
||||
`python examples/features/tensorize_vllm_model.py deserialize --help`.
|
||||
|
||||
Once a model is serialized, tensorizer can be invoked with the `LLM` class
|
||||
directly to load models:
|
||||
@@ -100,7 +100,7 @@ vllm serve s3://my-bucket/vllm/facebook/opt-125m/v1 \
|
||||
In order to see all of the available arguments usable to configure
|
||||
loading with tensorizer that are given to `TensorizerConfig`, run:
|
||||
|
||||
`python examples/others/tensorize_vllm_model.py deserialize --help`
|
||||
`python examples/features/tensorize_vllm_model.py deserialize --help`
|
||||
|
||||
under the `tensorizer options` section. These can also be used for
|
||||
deserialization in this example script, although `--tensorizer-uri` and
|
||||
@@ -43,7 +43,7 @@ Both platforms provide equivalent monitoring capabilities:
|
||||
First, navigate to this example's directory:
|
||||
|
||||
```bash
|
||||
cd examples/online_serving/dashboards
|
||||
cd examples/observability/dashboards
|
||||
```
|
||||
|
||||
### Grafana
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from openai import APIConnectionError, OpenAI
|
||||
from openai.pagination import SyncPage
|
||||
from openai.types.model import Model
|
||||
|
||||
|
||||
def get_first_model(client: OpenAI) -> str:
    """Return the ID of the first model listed by the vLLM server.

    Args:
        client: An OpenAI client whose ``base_url`` points at the server.

    Returns:
        The ``id`` of the first entry in the server's model list.

    Raises:
        RuntimeError: If the server cannot be reached, or if it reports
            no models.
    """
    try:
        models: SyncPage[Model] = client.models.list()
    except APIConnectionError as e:
        # The API key is intentionally NOT interpolated into the message:
        # exception text ends up in tracebacks and logs, which would leak
        # the credential.
        raise RuntimeError(
            "Failed to get the list of models from the vLLM server at "
            f"{client.base_url}. Check\n"
            "1. the server is running\n"
            "2. the server URL is correct\n"
            "3. the API key is correct"
        ) from e

    if not models.data:
        raise RuntimeError(f"No models found on the vLLM server at {client.base_url}")

    return models.data[0].id
|
||||
@@ -119,7 +119,7 @@ echo " - API Key: $API_KEY"
|
||||
echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN"
|
||||
echo ""
|
||||
echo "🧪 Test the server with:"
|
||||
echo " python examples/online_serving/openai_embedding_long_text/client.py"
|
||||
echo " python examples/pooling/embed/openai_embedding_long_text/client.py"
|
||||
echo ""
|
||||
echo "📚 Enhanced features enabled:"
|
||||
echo " ✅ Intelligent native pooling type detection"
|
||||
|
||||
@@ -15,14 +15,14 @@ Start the vLLM server:
|
||||
Then run this script:
|
||||
|
||||
# Use the built-in sample audio
|
||||
python examples/online_serving/openai_lid_client.py
|
||||
python examples/speech_to_text/lid/openai_lid_client.py
|
||||
|
||||
# Use your own audio file(s)
|
||||
python examples/online_serving/openai_lid_client.py \
|
||||
python examples/speech_to_text/lid/openai_lid_client.py \
|
||||
--audio_paths audio_en.wav audio_zh.wav audio_fr.wav
|
||||
|
||||
# Batch-identify multiple files in one run
|
||||
python examples/online_serving/openai_lid_client.py \
|
||||
python examples/speech_to_text/lid/openai_lid_client.py \
|
||||
--audio_paths /path/to/dir/*.wav
|
||||
|
||||
Requirements:
|
||||
|
||||
@@ -41,7 +41,7 @@ compressed-tensors == 0.15.0.1 # required for compressed-tensors
|
||||
depyf==0.20.0 # required for profiling and debugging with compilation config
|
||||
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
|
||||
watchfiles # required for http server to monitor the updates of TLS files
|
||||
python-json-logger # Used by logging as per examples/others/logging_configuration.md
|
||||
python-json-logger # Used by logging as per examples/features/logging_configuration.md
|
||||
ninja # Required for xgrammar, rocm, tpu, xpu
|
||||
pybase64 # fast base64 implementation
|
||||
cbor2 # Required for cross-language serialization of hashable objects
|
||||
|
||||
@@ -184,7 +184,7 @@ def test_tp2_serialize_and_deserialize_lora(
|
||||
result = subprocess.run(
|
||||
[
|
||||
sys.executable,
|
||||
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
|
||||
f"{VLLM_PATH}/examples/features/tensorize_vllm_model.py",
|
||||
"--model",
|
||||
MODEL_PATH,
|
||||
"--lora-path",
|
||||
|
||||
@@ -460,7 +460,7 @@ async def test_serialize_and_serve_entrypoints(tmp_path):
|
||||
result = subprocess.run(
|
||||
[
|
||||
sys.executable,
|
||||
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
|
||||
f"{VLLM_PATH}/examples/features/tensorize_vllm_model.py",
|
||||
"--model",
|
||||
model_ref,
|
||||
"serialize",
|
||||
|
||||
@@ -139,6 +139,6 @@ class VideoAsset:
|
||||
"""
|
||||
Read audio data from the video asset, used in Qwen2.5-Omni examples.
|
||||
|
||||
See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
|
||||
See also: examples/generate/multimodal/qwen2_5_omni/only_thinker.py
|
||||
"""
|
||||
return load_audio_pyav(self.video_path, sr=sampling_rate)[0]
|
||||
|
||||
@@ -211,7 +211,7 @@ class TensorizerConfig(MutableMapping):
|
||||
encryption_keyfile: File path to a binary file containing a
|
||||
binary key to use for decryption. `None` (the default) means
|
||||
no decryption. See the example script in
|
||||
examples/others/tensorize_vllm_model.py.
|
||||
examples/features/tensorize_vllm_model.py.
|
||||
s3_access_key_id: The access key for the S3 bucket. Can also be set via
|
||||
the S3_ACCESS_KEY_ID environment variable.
|
||||
s3_secret_access_key: The secret access key for the S3 bucket. Can also
|
||||
@@ -579,7 +579,7 @@ def tensorizer_weights_iterator(
|
||||
"loading on vLLM, as tensorizer is forced to load to CPU. "
|
||||
"Consider deserializing a vLLM model instead for faster "
|
||||
"load times. See the "
|
||||
"examples/others/tensorize_vllm_model.py example script "
|
||||
"examples/features/tensorize_vllm_model.py example script "
|
||||
"for serializing vLLM models."
|
||||
)
|
||||
|
||||
|
||||
@@ -73,7 +73,7 @@ class TensorizerLoader(BaseModelLoader):
|
||||
"""Load a serialized model with tensorizer to the CPU.
|
||||
|
||||
This is only necessary when the model isn't vLLM-tensorized (see
|
||||
examples/others/tensorize_vllm_model.py) This should still
|
||||
examples/features/tensorize_vllm_model.py) This should still
|
||||
be faster than default HuggingFace loading, but will be slower than
|
||||
loading a vLLM-tensorized model.
|
||||
"""
|
||||
@@ -104,7 +104,7 @@ class TensorizerLoader(BaseModelLoader):
|
||||
"""Load serialized model weights with tensorizer.
|
||||
|
||||
Expects a vLLM-tensorized model. See the
|
||||
examples/others/tensorize_vllm_model.py example script
|
||||
examples/features/tensorize_vllm_model.py example script
|
||||
for serializing vLLM models."""
|
||||
if is_vllm_tensorized(self.tensorizer_config):
|
||||
tensorizer_config = self._patch_tensorizer_config(model_config)
|
||||
|
||||
Reference in New Issue
Block a user