[nvbugs/5385972][nvbugs/5387423][Fix] Minor fix for llava_next/llava_onevision (#5998)

Signed-off-by: Mina Huai <121143971+MinaHuai@users.noreply.github.com>
This commit is contained in:
MinaHuai 2025-07-15 22:01:35 +08:00 committed by GitHub
parent ab1c54709d
commit 9ebc3ab9c4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 4 additions and 8 deletions

View File

@@ -2647,7 +2647,7 @@ class MultimodalModelRunner:
) )
image = None image = None
elif self.model_type in ['llava_onevision']: elif self.model_type in ['llava_onevision']:
pre_prompt = "<|im_start|>user " pre_prompt = "<|im_start|>user " + "<video>" if self.args.video_path is not None else "<image>"
if input_text is None: if input_text is None:
input_text = "Question: which city is this? Answer:" if self.args.video_path is None else "Why is this video funny?" input_text = "Question: which city is this? Answer:" if self.args.video_path is None else "Why is this video funny?"
post_prompt = f"\n{input_text}<|im_end|><|im_start|>assistant\n" post_prompt = f"\n{input_text}<|im_end|><|im_start|>assistant\n"
@@ -2658,7 +2658,7 @@ class MultimodalModelRunner:
text=prompt, text=prompt,
return_tensors="pt") return_tensors="pt")
else: else:
image = self.processor(videos=raw_image, image = self.processor(videos=list(raw_image),
text=prompt, text=prompt,
return_tensors="pt") return_tensors="pt")

View File

@@ -596,12 +596,12 @@ def build_llava_engine(args):
args.output_dir, args.output_dir,
args.max_batch_size) args.max_batch_size)
if args.model_type == "llava_next": if args.model_type == "llava_next":
image_newline = model.image_newline.data image_newline = model.model.image_newline.data
tensor_img_newline = {"image_newline": image_newline} tensor_img_newline = {"image_newline": image_newline}
save_file(tensor_img_newline, save_file(tensor_img_newline,
os.path.join(args.output_dir, "image_newlines.safetensors")) os.path.join(args.output_dir, "image_newlines.safetensors"))
if args.model_type == "llava_onevision": if args.model_type == "llava_onevision":
image_newline = model.image_newline.data image_newline = model.model.image_newline.data
tensor_img_newline = {"image_newline": image_newline} tensor_img_newline = {"image_newline": image_newline}
save_file(tensor_img_newline, save_file(tensor_img_newline,
os.path.join(args.output_dir, "image_newlines.safetensors")) os.path.join(args.output_dir, "image_newlines.safetensors"))

View File

@@ -432,15 +432,11 @@ examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-fl
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5380101) accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5380101)
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8] SKIP (https://nvbugs/5380570) test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8] SKIP (https://nvbugs/5380570)
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Nemotron-Ultra-253B-nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1] SKIP (https://nvbugs/5380570) test_e2e.py::test_ptp_quickstart_advanced_8gpus[Nemotron-Ultra-253B-nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1] SKIP (https://nvbugs/5380570)
triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5385972)
examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] SKIP (https://nvbugs/5385981) examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] SKIP (https://nvbugs/5385981)
examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385972)
examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5385987) examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5385987)
examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992) examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5377914) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5377914)
test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] SKIP (https://nvbugs/5387375) test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] SKIP (https://nvbugs/5387375)
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387423)
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387423)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422) examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422)
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424) examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424)
test_e2e.py::test_ptp_quickstart SKIP (https://nvbugs/5387762) test_e2e.py::test_ptp_quickstart SKIP (https://nvbugs/5387762)