[Examples][last/6] Resettle examples. (#41084)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
wang.yuqi
2026-05-08 10:42:12 +08:00
committed by GitHub
parent 57c2f724c1
commit 1d694e78c9
61 changed files with 60 additions and 88 deletions
@@ -136,8 +136,6 @@ run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
+6 -6
View File
@@ -394,8 +394,8 @@ steps:
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# Features demo
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 deployment/llm_engine_example.py
- python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -1649,8 +1649,8 @@ steps:
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# Features demo
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 deployment/llm_engine_example.py
- python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
@@ -2930,8 +2930,8 @@ steps:
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# Features demo
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 deployment/llm_engine_example.py
- python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+2 -2
View File
@@ -117,8 +117,8 @@ steps:
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 deployment/llm_engine_example.py
- python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+3 -3
View File
@@ -37,7 +37,7 @@ steps:
- examples/generate/multimodal/
- examples/features/
- examples/pooling/embed/vision_embedding_offline.py
- examples/others/tensorize_vllm_model.py
- examples/features/tensorize_vllm_model.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
@@ -55,8 +55,8 @@ steps:
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 features/automatic_prefix_caching/prefix_caching_offline.py
- python3 offline_inference/llm_engine_example.py
- python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 deployment/llm_engine_example.py
- python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 examples/features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+1 -1
View File
@@ -860,7 +860,7 @@ LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm" \
# define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker
COPY examples/online_serving/sagemaker-entrypoint.sh .
COPY examples/deployment/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
+1 -1
View File
@@ -278,7 +278,7 @@ Once your model implements `SupportsTranscription`, you can test the endpoints (
http://localhost:8000/v1/audio/translations
```
Or check out more examples in [examples/online_serving](../../../examples/online_serving).
Or check out more examples in [examples/speech_to_text](../../../examples/speech_to_text).
!!! note
- If your model handles chunking internally (e.g., via its processor or encoder), set `min_energy_split_window_size=None` in the returned `SpeechToTextConfig` to disable server-side chunking.
+1 -1
View File
@@ -17,7 +17,7 @@ Before you begin, ensure that you have the following:
## Installing the chart
This guide uses the Helm chart at [examples/online_serving/chart-helm](../../../examples/online_serving/chart-helm).
This guide uses the Helm chart at [examples/deployment/chart-helm](../../../examples/deployment/chart-helm).
To install the chart with the release name `test-vllm`:
+2 -2
View File
@@ -40,7 +40,7 @@ Deploy the following yaml file `lws.yaml`
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2"
resources:
limits:
@@ -73,7 +73,7 @@ Deploy the following yaml file `lws.yaml`
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "8"
@@ -36,7 +36,7 @@ pip install -U vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_langchain.py](../../../examples/online_serving/retrieval_augmented_generation_with_langchain.py)
1. Use the script: [examples/applications/rag/retrieval_augmented_generation_with_langchain.py](../../../examples/applications/rag/retrieval_augmented_generation_with_langchain.py)
1. Run the script
@@ -74,7 +74,7 @@ pip install vllm \
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
```
1. Use the script: [examples/online_serving/retrieval_augmented_generation_with_llamaindex.py](../../../examples/online_serving/retrieval_augmented_generation_with_llamaindex.py)
1. Use the script: [examples/applications/rag/retrieval_augmented_generation_with_llamaindex.py](../../../examples/applications/rag/retrieval_augmented_generation_with_llamaindex.py)
1. Run the script:
+2 -2
View File
@@ -59,7 +59,7 @@ See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypil
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
python vllm/examples/applications/chatbot/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://localhost:8081/v1 \
@@ -305,7 +305,7 @@ It is also possible to access the Llama-3 service with a separate GUI frontend,
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
python vllm/examples/applications/api_client/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://$ENDPOINT/v1 \
+1 -1
View File
@@ -20,7 +20,7 @@ pip install vllm streamlit openai
vllm serve Qwen/Qwen1.5-0.5B-Chat
```
1. Use the script: [examples/online_serving/streamlit_openai_chatbot_webserver.py](../../../examples/online_serving/streamlit_openai_chatbot_webserver.py)
1. Use the script: [examples/applications/chatbot/streamlit_openai_chatbot_webserver.py](../../../examples/applications/chatbot/streamlit_openai_chatbot_webserver.py)
1. Start the streamlit web UI and start to chat:
+4 -4
View File
@@ -78,7 +78,7 @@ Key points from the example YAML:
- sh
- -c
- >
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=2;
python3 -m vllm.entrypoints.openai.api_server
--port 8080
--model meta-llama/Llama-3.1-405B-Instruct
@@ -93,7 +93,7 @@ Key points from the example YAML:
- sh
- -c
- >
bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)
bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)
```
---
@@ -144,7 +144,7 @@ spec:
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=2;
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh leader --ray_cluster_size=2;
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline-parallel-size 2"
resources:
limits:
@@ -178,7 +178,7 @@ spec:
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)"
- "bash /vllm-workspace/examples/ray_serving/multi-node-serving.sh worker --ray_address=$(ENTRY_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "8"
+3 -3
View File
@@ -2,6 +2,6 @@
vLLM's examples are split into three categories:
- If you are using vLLM from within Python code, see the [Offline Inference](../../examples/offline_inference) section.
- If you are using vLLM from an HTTP application or client, see the [Online Serving](../../examples/online_serving) section.
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](../../examples/others) section.
- If you are using vLLM from within Python code, see the [Offline Inference](.) section.
- If you are using vLLM from an HTTP application or client, see the [Online Serving](.) section.
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the [Others](.) section.
+1 -1
View File
@@ -47,7 +47,7 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
```bash
python examples/offline_inference/llm_engine_example.py \
python examples/deployment/llm_engine_example.py \
--model TheBloke/Llama-2-7b-Chat-AWQ \
--quantization awq
```
+1 -1
View File
@@ -58,7 +58,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
```bash
python examples/offline_inference/llm_engine_example.py \
python examples/deployment/llm_engine_example.py \
--model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
```
+1 -1
View File
@@ -157,7 +157,7 @@ OpenAI Python client library does not officially support `reasoning` attribute f
print(content, end="", flush=True)
```
Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py).
Remember to check whether the `reasoning` exists in the response before accessing it. You could check out the [example](https://github.com/vllm-project/vllm/blob/main/examples/reasoning/openai_chat_completion_with_reasoning_streaming.py).
## Tool Calling
@@ -88,7 +88,7 @@ vllm serve facebook/opt-13b \
-tp=8
```
By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the [examples/online_serving/run_cluster.sh](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/run_cluster.sh) helper script.
By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the [examples/ray_serving/run_cluster.sh](https://github.com/vllm-project/vllm/blob/main/examples/ray_serving/run_cluster.sh) helper script.
--8<-- [end:supported-features]
--8<-- [start:distributed-backend]
+4 -4
View File
@@ -14,7 +14,7 @@ To install `tensorizer`, run `pip install vllm[tensorizer]`.
## The basics
To load a model using Tensorizer, the model first needs to be serialized by
Tensorizer. [The example script](../../examples/others/tensorize_vllm_model.md) takes care of this process.
Tensorizer. [The example script](../../../examples/features/tensorize_vllm_model.py) takes care of this process.
Let's walk through a basic example by serializing `facebook/opt-125m` using the script, and then loading it for inference.
@@ -25,7 +25,7 @@ CLI arguments. The docstring for the script itself explains the CLI args
and how to use it properly in great detail, and we'll use one of the examples from the docstring directly, assuming we want to serialize and save our model at our S3 bucket example `s3://my-bucket`:
```bash
python examples/others/tensorize_vllm_model.py \
python examples/features/tensorize_vllm_model.py \
--model facebook/opt-125m \
serialize \
--serialized-directory s3://my-bucket \
@@ -35,7 +35,7 @@ python examples/others/tensorize_vllm_model.py \
This saves the model tensors at `s3://my-bucket/vllm/facebook/opt-125m/v1`. If you intend on applying a LoRA adapter to your tensorized model, you can pass the HF id of the LoRA adapter in the above command, and the artifacts will be saved there too:
```bash
python examples/others/tensorize_vllm_model.py \
python examples/features/tensorize_vllm_model.py \
--model facebook/opt-125m \
--lora-path <lora_id> \
serialize \
@@ -71,7 +71,7 @@ llm = LLM(
As an example, CPU concurrency can be limited when serializing with `tensorizer` via the `limit_cpu_concurrency` parameter in the initializer for `TensorSerializer`. To set `limit_cpu_concurrency` to some arbitrary value, you would do so like this when serializing:
```bash
python examples/others/tensorize_vllm_model.py \
python examples/features/tensorize_vllm_model.py \
--model facebook/opt-125m \
--lora-path <lora_id> \
serialize \
+3 -3
View File
@@ -251,7 +251,7 @@ The following extra parameters are supported:
Our Responses API is compatible with [OpenAI's Responses API](https://platform.openai.com/docs/api-reference/responses);
you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
Code example: [examples/online_serving/openai_responses_client_with_tools.py](../../examples/tool_calling/openai_responses_client_with_tools.py)
Code example: [examples/tool_calling/openai_responses_client_with_tools.py](../../examples/tool_calling/openai_responses_client_with_tools.py)
#### Extra parameters
@@ -456,8 +456,8 @@ Audio must be sent as base64-encoded PCM16 audio at 16kHz sample rate, mono chan
#### Example Clients
- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_client.py) - Upload and transcribe an audio file
- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/online_serving/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription
- [openai_realtime_client.py](https://github.com/vllm-project/vllm/tree/main/examples/speech_to_text/realtime/openai_realtime_client.py) - Upload and transcribe an audio file
- [openai_realtime_microphone_client.py](https://github.com/vllm-project/vllm/tree/main/examples/speech_to_text/realtime/openai_realtime_microphone_client.py) - Gradio demo for live microphone transcription
### Tokenizer API
@@ -5,7 +5,7 @@ Start vLLM API server:
vllm serve meta-llama/Llama-2-7b-chat-hf
Start Gradio OpenAI Chatbot Webserver:
python examples/online_serving/gradio_openai_chatbot_webserver.py \
python examples/applications/chatbot/gradio_openai_chatbot_webserver.py \
-m meta-llama/Llama-2-7b-chat-hf
Note that `pip install --upgrade gradio` is needed to run this example.
@@ -6,7 +6,7 @@ Start vLLM API server:
--model meta-llama/Llama-2-7b-chat-hf
Start Webserver:
python examples/online_serving/gradio_webserver.py
python examples/applications/chatbot/gradio_webserver.py
Note that `pip install --upgrade gradio` is needed to run this example.
More details: https://github.com/gradio-app/gradio
@@ -8,7 +8,7 @@ token-by-token output in offline inference scenarios. It demonstrates DELTA mode
streaming where you receive new tokens as they are generated.
Usage:
python examples/offline_inference/async_llm_streaming.py
python examples/deployment/async_llm_streaming.py
"""
import asyncio
@@ -33,7 +33,7 @@ https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something
like this from the root level of this repository:
python examples/others/tensorize_vllm_model.py \
python examples/features/tensorize_vllm_model.py \
--model facebook/opt-125m \
serialize \
--serialized-directory s3://my-bucket \
@@ -53,7 +53,7 @@ providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root
level of this repository:
python examples/others/tensorize_vllm_model.py \
python examples/features/tensorize_vllm_model.py \
--model EleutherAI/gpt-j-6B \
--dtype float16 \
deserialize \
@@ -71,11 +71,11 @@ shard's rank. Sharded models serialized with this script will be named as
model-rank-%03d.tensors
For more information on the available arguments for serializing, run
`python -m examples.others.tensorize_vllm_model serialize --help`.
`python -m examples.features.tensorize_vllm_model serialize --help`.
Or for deserializing:
`python examples/others/tensorize_vllm_model.py deserialize --help`.
`python examples/features/tensorize_vllm_model.py deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:
@@ -100,7 +100,7 @@ vllm serve s3://my-bucket/vllm/facebook/opt-125m/v1 \
In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:
`python examples/others/tensorize_vllm_model.py deserialize --help`
`python examples/features/tensorize_vllm_model.py deserialize --help`
under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
+1 -1
View File
@@ -43,7 +43,7 @@ Both platforms provide equivalent monitoring capabilities:
First, navigate to this example's directory:
```bash
cd examples/online_serving/dashboards
cd examples/observability/dashboards
```
### Grafana
-26
View File
@@ -1,26 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from openai import APIConnectionError, OpenAI
from openai.pagination import SyncPage
from openai.types.model import Model
def get_first_model(client: OpenAI) -> str:
"""
Get the first model from the vLLM server.
"""
try:
models: SyncPage[Model] = client.models.list()
except APIConnectionError as e:
raise RuntimeError(
"Failed to get the list of models from the vLLM server at "
f"{client.base_url} with API key {client.api_key}. Check\n"
"1. the server is running\n"
"2. the server URL is correct\n"
"3. the API key is correct"
) from e
if len(models.data) == 0:
raise RuntimeError(f"No models found on the vLLM server at {client.base_url}")
return models.data[0].id
@@ -119,7 +119,7 @@ echo " - API Key: $API_KEY"
echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN"
echo ""
echo "🧪 Test the server with:"
echo " python examples/online_serving/openai_embedding_long_text/client.py"
echo " python examples/pooling/embed/openai_embedding_long_text/client.py"
echo ""
echo "📚 Enhanced features enabled:"
echo " ✅ Intelligent native pooling type detection"
@@ -15,14 +15,14 @@ Start the vLLM server:
Then run this script:
# Use the built-in sample audio
python examples/online_serving/openai_lid_client.py
python examples/speech_to_text/lid/openai_lid_client.py
# Use your own audio file(s)
python examples/online_serving/openai_lid_client.py \
python examples/speech_to_text/lid/openai_lid_client.py \
--audio_paths audio_en.wav audio_zh.wav audio_fr.wav
# Batch-identify multiple files in one run
python examples/online_serving/openai_lid_client.py \
python examples/speech_to_text/lid/openai_lid_client.py \
--audio_paths /path/to/dir/*.wav
Requirements:
+1 -1
View File
@@ -41,7 +41,7 @@ compressed-tensors == 0.15.0.1 # required for compressed-tensors
depyf==0.20.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/others/logging_configuration.md
python-json-logger # Used by logging as per examples/features/logging_configuration.md
ninja # Required for xgrammar, rocm, tpu, xpu
pybase64 # fast base64 implementation
cbor2 # Required for cross-language serialization of hashable objects
+1 -1
View File
@@ -184,7 +184,7 @@ def test_tp2_serialize_and_deserialize_lora(
result = subprocess.run(
[
sys.executable,
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
f"{VLLM_PATH}/examples/features/tensorize_vllm_model.py",
"--model",
MODEL_PATH,
"--lora-path",
@@ -460,7 +460,7 @@ async def test_serialize_and_serve_entrypoints(tmp_path):
result = subprocess.run(
[
sys.executable,
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
f"{VLLM_PATH}/examples/features/tensorize_vllm_model.py",
"--model",
model_ref,
"serialize",
+1 -1
View File
@@ -139,6 +139,6 @@ class VideoAsset:
"""
Read audio data from the video asset, used in Qwen2.5-Omni examples.
See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
See also: examples/generate/multimodal/qwen2_5_omni/only_thinker.py
"""
return load_audio_pyav(self.video_path, sr=sampling_rate)[0]
@@ -211,7 +211,7 @@ class TensorizerConfig(MutableMapping):
encryption_keyfile: File path to a binary file containing a
binary key to use for decryption. `None` (the default) means
no decryption. See the example script in
examples/others/tensorize_vllm_model.py.
examples/features/tensorize_vllm_model.py.
s3_access_key_id: The access key for the S3 bucket. Can also be set via
the S3_ACCESS_KEY_ID environment variable.
s3_secret_access_key: The secret access key for the S3 bucket. Can also
@@ -579,7 +579,7 @@ def tensorizer_weights_iterator(
"loading on vLLM, as tensorizer is forced to load to CPU. "
"Consider deserializing a vLLM model instead for faster "
"load times. See the "
"examples/others/tensorize_vllm_model.py example script "
"examples/features/tensorize_vllm_model.py example script "
"for serializing vLLM models."
)
@@ -73,7 +73,7 @@ class TensorizerLoader(BaseModelLoader):
"""Load a serialized model with tensorizer to the CPU.
This is only necessary when the model isn't vLLM-tensorized (see
examples/others/tensorize_vllm_model.py) This should still
examples/features/tensorize_vllm_model.py) This should still
be faster than default HuggingFace loading, but will be slower than
loading a vLLM-tensorized model.
"""
@@ -104,7 +104,7 @@ class TensorizerLoader(BaseModelLoader):
"""Load serialized model weights with tensorizer.
Expects a vLLM-tensorized model. See the
examples/others/tensorize_vllm_model.py example script
examples/features/tensorize_vllm_model.py example script
for serializing vLLM models."""
if is_vllm_tensorized(self.tensorizer_config):
tensorizer_config = self._patch_tensorizer_config(model_config)