From 3ef3b217a8408a6ad8630070b5ad07839660de75 Mon Sep 17 00:00:00 2001
From: Deepak Gowda Doddbele Aswatha Narayana
Date: Tue, 25 Feb 2025 16:36:43 -0800
Subject: [PATCH 1/3] Update test_suite.sh

---
 test_suite.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_suite.sh b/test_suite.sh
index e9850fb9a32ee..a0ce96a865dc0 100644
--- a/test_suite.sh
+++ b/test_suite.sh
@@ -4,8 +4,12 @@ bash run_offline.sh -m $ModelName -i snowscat --multiple_prompts --skip_warmup
 bash run_offline.sh -m $ModelName -i snowscat --skip_warmup
 bash run_offline.sh -m $ModelName -i synthetic --multiple_prompts --skip_warmup
 bash run_offline.sh -m $ModelName -i synthetic --skip_warmup
+bash run_offline.sh -m $ModelName -v --skip_warmup
+bash run_offline.sh -m $ModelName -v --multiple_prompts --skip_warmup
 # with warmups
 bash run_offline.sh -m $ModelName -i snowscat --multiple_prompts
 bash run_offline.sh -m $ModelName -i snowscat
 bash run_offline.sh -m $ModelName -i synthetic --multiple_prompts
 bash run_offline.sh -m $ModelName -i synthetic
+bash run_offline.sh -m $ModelName -v
+bash run_offline.sh -m $ModelName -v --multiple_prompts
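Note: the new -v cases rely on the optional-argument handling that patch 2 below adds to run_offline.sh: when -v is passed without a value, the script falls back to a built-in sample clip. A usage sketch (the model stub is a placeholder chosen for illustration; the URL shown is the default that patch 2 hard-codes):

    # Use the built-in default clip:
    bash run_offline.sh -m Qwen/Qwen2-VL-7B-Instruct -v

    # Or pass an explicit URL; the value must not begin with "-",
    # or the parser treats -v as having no argument:
    bash run_offline.sh -m Qwen/Qwen2-VL-7B-Instruct \
        -v https://huggingface.co/spaces/merve/llava-interleave/resolve/main/cats_1.mp4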
From a3e61828d2ec333c7dac1f106a16d5568dc14450 Mon Sep 17 00:00:00 2001
From: Deepak Gowda Doddbele Aswatha Narayana
Date: Tue, 25 Feb 2025 16:37:31 -0800
Subject: [PATCH 2/3] Update run_offline.sh

---
 run_offline.sh | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/run_offline.sh b/run_offline.sh
index 6c785b9014937..5f7af37ff6125 100644
--- a/run_offline.sh
+++ b/run_offline.sh
@@ -1,10 +1,11 @@
-set -ex
+# set -ex
 
 usage() {
     echo "Usage: $0"
     echo "Options:"
     echo "  --model|-m       Model path or model stub"
     echo "  --image_type|-i  Image type: snowscat | synthetic"
+    echo "  --video|-v       URL of the input video (defaults to a sample clip if no URL is given)"
     echo "  --iter           Number of iterations (default: 1)"
     exit 1
 }
@@ -31,6 +32,15 @@ ImageType=$2
         shift 2
         ;;
+    --video | -v)
+        if [[ -n "$2" && ! "$2" =~ ^- ]]; then  # take the URL if one was provided
+            video=$2
+            shift 2
+        else
+            video="https://huggingface.co/spaces/merve/llava-interleave/resolve/main/cats_1.mp4"  # default sample clip
+            shift 1
+        fi
+        ;;
     --iter)
         iter=$2
         shift 2
@@ -44,6 +54,7 @@ done
 #Set Default values
 iter=${iter:-1}
 ImageType=${ImageType:-"snowscat"}
+video=${video}  # intentionally no default here; stays empty unless -v was given
 
 if [[ -n $HELP ]]; then
     usage
@@ -71,6 +82,13 @@ if [[ $ImageType == "snowscat" ]] && [[ ! $(md5sum /tmp/snowscat-H3oXiq7_bII-unsplash.jpg
 'wb').write(r.content); image = Image.open(f'./{filename}'); image = image.resize((1200, 600)); image.save(f'/tmp/{filename}')"
 fi
 
+if [[ -n "$video" ]]; then
+    filename=$(basename "$video")
+    echo "Downloading video $filename from $video"
+    wget -O "/tmp/$filename" "$video"
+    videofile="/tmp/$filename"
+fi
+
 if [[ -n $InstallVLLM ]]; then
     git clone https://github.com/HabanaAI/vllm-fork.git -b sarkar/qwen2
     cd vllm-fork
@@ -97,5 +115,11 @@ if [[ "$model" == *"Qwen2"* ]]; then
     export WORKAROUND=1
 fi
 
-ARGS="-m $model -i $ImageType --iter $iter $EXTRAARGS"
+if [[ -n "$video" ]]; then
+    ARGS="-m $model -v $videofile --iter $iter $EXTRAARGS"
+else
+    ARGS="-m $model -i $ImageType --iter $iter $EXTRAARGS"
+fi
+
 python offline_inferece.py $ARGS
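A note on the download hunk: unlike the snowscat branch just above it, which checks an md5sum before re-downloading the image, the video is fetched with wget on every invocation. A cached variant in the same spirit could look like the sketch below (not part of the patch; it only skips the download when the file already exists rather than verifying a checksum):

    if [[ -n "$video" ]]; then
        filename=$(basename "$video")
        videofile="/tmp/$filename"
        # Download only when the clip is not already cached in /tmp
        if [[ ! -f "$videofile" ]]; then
            echo "Downloading video $filename from $video"
            wget -O "$videofile" "$video"
        fi
    fi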
From 8956cb6725bf579026d7f2f33f1f0a084664c9f9 Mon Sep 17 00:00:00 2001
From: Deepak Gowda Doddbele Aswatha Narayana
Date: Tue, 25 Feb 2025 16:37:49 -0800
Subject: [PATCH 3/3] Update offline_inferece.py

---
 offline_inferece.py | 106 +++++++++++++++++++++++++++++++-------------
 1 file changed, 74 insertions(+), 32 deletions(-)

diff --git a/offline_inferece.py b/offline_inferece.py
index 188cc28759796..e5f7267bc404d 100644
--- a/offline_inferece.py
+++ b/offline_inferece.py
@@ -2,6 +2,8 @@
 from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
 import PIL
+import cv2
+from PIL import Image
 
 import argparse
@@ -10,6 +12,7 @@
 # Add arguments
 parser.add_argument("-m", "--model", help="model name or path")
 parser.add_argument("-i", "--image", help="type of image")
+parser.add_argument("-v", "--video", help="video input (path or URL)")
 parser.add_argument(
     "--multiple_prompts", action="store_true", help="to run with multiple prompts"
 )
@@ -18,7 +21,25 @@
 # Parse the arguments
 args = parser.parse_args()
 
-# Load the image
+# Process the video input and select num_frames evenly spaced frames
+def sample_frames(path, num_frames):
+    video = cv2.VideoCapture(path)
+    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+    interval = max(total_frames // num_frames, 1)  # guard against clips shorter than num_frames
+    frames = []
+    for i in range(total_frames):
+        ret, frame = video.read()
+        if not ret:  # skip frames that fail to decode
+            continue
+        if i % interval == 0:
+            # Convert OpenCV's BGR layout to RGB before handing the frame to PIL
+            pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            frames.append(pil_img.resize((256, 256)))
+    video.release()
+    return frames[:num_frames]
+
+# Load the image / video
 if args.image == "synthetic":
     image = ImageAsset("stop_sign").pil_image
@@ -27,8 +48,10 @@
 elif args.image == "snowscat":
     filename = "/tmp/snowscat-H3oXiq7_bII-unsplash.jpg"
     image = PIL.Image.open(filename)
+elif args.video:
+    video = sample_frames(args.video, 50)
 else:
-    print(f"unknow image {args.image}")
+    print(f"unknown image/video input: {args.image} {args.video}")
     exit(1)
 
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
@@ -41,43 +64,62 @@
 mdl = args.model
 multiple_prompt = args.multiple_prompts
 
-question = "Describe this image."
-# Prompt example: https://docs.vllm.ai/en/v0.6.2/getting_started/examples/offline_inference_vision_language.html
-if "Qwen2" in mdl:
-    llm = LLM(
-        model=mdl,
-        enforce_eager=False,
-        max_model_len=32768,
-        max_num_seqs=5,
-        limit_mm_per_prompt={"image": 1},
-    )
-    prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{question}g<|im_end|>\n<|im_start|>assistant\n"
+if args.video:
+    question = "Describe this video."
+    llm = LLM(
+        model=mdl,
+        enforce_eager=True,
+        dtype="bfloat16",
+        gpu_memory_utilization=0.6,
+    )
+    prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>{question}<|im_end|>\n<|im_start|>assistant\n"
 
     if multiple_prompt:
         batch_data = [
-            {"prompt": prompt, "multi_modal_data": {"image": image}},
+            {"prompt": prompt, "multi_modal_data": {"video": video}},
             {
-                "prompt": "<|im_start|>system\nYou are a nice person.<|im_end|>\n<|im_start|>user\nTell me about you.<|im_end|>\n<|im_start|>assistant\n"
+                "prompt": "<|im_start|>system\nYou are a nice person.<|im_end|>\n<|im_start|>user\nTell me about the future of global warming.<|im_end|>\n<|im_start|>assistant\n"
             },
-            {"prompt": prompt, "multi_modal_data": {"image": image}},
+            {"prompt": prompt, "multi_modal_data": {"video": video}},
         ]
     else:
-        batch_data = {"prompt": prompt, "multi_modal_data": {"image": image}}
-
-elif "Llama-3.2" in mdl:
-    llm = LLM(
-        model=mdl,
-        max_model_len=2048,
-        max_num_seqs=64,
-        tensor_parallel_size=1,
-        num_scheduler_steps=32,
-        max_num_prefill_seqs=4,
-    )
-    from vllm import TextPrompt
-    batch_data = TextPrompt(prompt=f"<|image|><|begin_of_text|>{question}")
-    batch_data["multi_modal_data"] = {"image": image}
-else:
-    print(f"{mdl} is not known model?")
+        batch_data = {"prompt": prompt, "multi_modal_data": {"video": video}}
+
+else:
+    question = "Describe this image."
+    # Prompt example: https://docs.vllm.ai/en/v0.6.2/getting_started/examples/offline_inference_vision_language.html
+    if "Qwen2" in mdl:
+        llm = LLM(
+            model=mdl,
+            enforce_eager=False,
+            max_model_len=32768,
+            max_num_seqs=5,
+            limit_mm_per_prompt={"image": 1},
+        )
+        prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{question}<|im_end|>\n<|im_start|>assistant\n"
+
+        if multiple_prompt:
+            batch_data = [
+                {"prompt": prompt, "multi_modal_data": {"image": image}},
+                {
+                    "prompt": "<|im_start|>system\nYou are a nice person.<|im_end|>\n<|im_start|>user\nTell me about you.<|im_end|>\n<|im_start|>assistant\n"
+                },
+                {"prompt": prompt, "multi_modal_data": {"image": image}},
+            ]
+        else:
+            batch_data = {"prompt": prompt, "multi_modal_data": {"image": image}}
+
+    elif "Llama-3.2" in mdl:
+        llm = LLM(
+            model=mdl,
+            max_model_len=2048,
+            max_num_seqs=64,
+            tensor_parallel_size=1,
+            num_scheduler_steps=32,
+            max_num_prefill_seqs=4,
+        )
+        from vllm import TextPrompt
+        # Restore the batch_data construction from the deleted block above;
+        # without it, this branch defines prompt but never builds batch_data.
+        batch_data = TextPrompt(prompt=f"<|image|><|begin_of_text|>{question}")
+        batch_data["multi_modal_data"] = {"image": image}
+    else:
+        print(f"unknown model: {mdl}")
 
 for i in range(int(args.iter)):
     print(f"==ITER : [{i}]")
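The visible context ends at the top of the iteration loop, so the generate call itself is not shown in the hunk. Going by vLLM's standard offline API, each iteration presumably passes batch_data and sampling_params to llm.generate. A minimal standalone sketch of the video path, with assumptions marked (the model stub and file path are placeholders, and 50 is the frame count hard-coded above):

    # Minimal end-to-end sketch of the video path; not the patch itself.
    from vllm import LLM, SamplingParams

    # sample_frames() is the helper added in this patch: it returns a list
    # of resized PIL images taken at even intervals from the clip.
    frames = sample_frames("/tmp/cats_1.mp4", 50)  # placeholder path

    llm = LLM(
        model="Qwen/Qwen2-VL-7B-Instruct",  # placeholder; the script passes -m/--model
        enforce_eager=True,
        dtype="bfloat16",
        gpu_memory_utilization=0.6,
    )
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
        "Describe this video.<|im_end|>\n<|im_start|>assistant\n"
    )
    batch_data = {"prompt": prompt, "multi_modal_data": {"video": frames}}

    # The loop body is presumably the standard vLLM offline call:
    outputs = llm.generate(batch_data, sampling_params)
    for out in outputs:
        print(out.outputs[0].text)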