From 3ef3b217a8408a6ad8630070b5ad07839660de75 Mon Sep 17 00:00:00 2001
From: Deepak Gowda Doddbele Aswatha Narayana
Date: Tue, 25 Feb 2025 16:36:43 -0800
Subject: [PATCH 1/3] Update test_suite.sh

---
 test_suite.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_suite.sh b/test_suite.sh
index e9850fb9a32ee..a0ce96a865dc0 100644
--- a/test_suite.sh
+++ b/test_suite.sh
@@ -4,8 +4,12 @@ bash run_offline.sh -m $ModelName -i snowscat --multiple_prompts --skip_warmup
 bash run_offline.sh -m $ModelName -i snowscat --skip_warmup
 bash run_offline.sh -m $ModelName -i synthetic --multiple_prompts --skip_warmup
 bash run_offline.sh -m $ModelName -i synthetic --skip_warmup
+bash run_offline.sh -m $ModelName -v --skip_warmup
+bash run_offline.sh -m $ModelName -v --multiple_prompts --skip_warmup
 # with warmups
 bash run_offline.sh -m $ModelName -i snowscat --multiple_prompts
 bash run_offline.sh -m $ModelName -i snowscat
 bash run_offline.sh -m $ModelName -i synthetic --multiple_prompts
 bash run_offline.sh -m $ModelName -i synthetic
+bash run_offline.sh -m $ModelName -v
+bash run_offline.sh -m $ModelName -v --multiple_prompts
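Note: the new -v cases rely on the optional-argument handling that patch 2 below adds to run_offline.sh: when -v is passed without a value, the script falls back to a built-in sample clip. A usage sketch (the model stub is a placeholder chosen for illustration; the URL shown is the default that patch 2 hard-codes):

    # Use the built-in default clip:
    bash run_offline.sh -m Qwen/Qwen2-VL-7B-Instruct -v

    # Or pass an explicit URL; the value must not begin with "-",
    # or the parser treats -v as having no argument:
    bash run_offline.sh -m Qwen/Qwen2-VL-7B-Instruct \
        -v https://huggingface.co/spaces/merve/llava-interleave/resolve/main/cats_1.mp4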
From a3e61828d2ec333c7dac1f106a16d5568dc14450 Mon Sep 17 00:00:00 2001
From: Deepak Gowda Doddbele Aswatha Narayana
Date: Tue, 25 Feb 2025 16:37:31 -0800
Subject: [PATCH 2/3] Update run_offline.sh

---
 run_offline.sh | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/run_offline.sh b/run_offline.sh
index 6c785b9014937..5f7af37ff6125 100644
--- a/run_offline.sh
+++ b/run_offline.sh
@@ -1,10 +1,11 @@
-set -ex
+# set -ex
 
 usage() {
     echo "Usage: $0"
     echo "Options:"
     echo "  --model|-m       Model path or model stub"
     echo "  --image_type|-i  Image type: snowscat | synthetic"
+    echo "  --video|-v       URL of the input video (defaults to a sample clip if no URL is given)"
     echo "  --iter           Number of iterations (default: 1)"
     exit 1
 }
@@ -31,6 +32,15 @@ ImageType=$2
         shift 2
         ;;
+    --video | -v)
+        if [[ -n "$2" && ! "$2" =~ ^- ]]; then  # take the URL if one was provided
+            video=$2
+            shift 2
+        else
+            video="https://huggingface.co/spaces/merve/llava-interleave/resolve/main/cats_1.mp4"  # default sample clip
+            shift 1
+        fi
+        ;;
     --iter)
         iter=$2
         shift 2
@@ -44,6 +54,7 @@ done
 #Set Default values
 iter=${iter:-1}
 ImageType=${ImageType:-"snowscat"}
+video=${video}  # intentionally no default here; stays empty unless -v was given
 
 if [[ -n $HELP ]]; then
     usage
@@ -71,6 +82,13 @@ if [[ $ImageType == "snowscat" ]] && [[ ! $(md5sum /tmp/snowscat-H3oXiq7_bII-unsplash.jpg
 'wb').write(r.content); image = Image.open(f'./{filename}'); image = image.resize((1200, 600)); image.save(f'/tmp/{filename}')"
 fi
 
+if [[ -n "$video" ]]; then
+    filename=$(basename "$video")
+    echo "Downloading video $filename from $video"
+    wget -O "/tmp/$filename" "$video"
+    videofile="/tmp/$filename"
+fi
+
 if [[ -n $InstallVLLM ]]; then
     git clone https://github.com/HabanaAI/vllm-fork.git -b sarkar/qwen2
     cd vllm-fork
@@ -97,5 +115,11 @@ if [[ "$model" == *"Qwen2"* ]]; then
     export WORKAROUND=1
 fi
 
-ARGS="-m $model -i $ImageType --iter $iter $EXTRAARGS"
+if [[ -n "$video" ]]; then
+    ARGS="-m $model -v $videofile --iter $iter $EXTRAARGS"
+else
+    ARGS="-m $model -i $ImageType --iter $iter $EXTRAARGS"
+fi
+
 python offline_inferece.py $ARGS
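A note on the download hunk: unlike the snowscat branch just above it, which checks an md5sum before re-downloading the image, the video is fetched with wget on every invocation. A cached variant in the same spirit could look like the sketch below (not part of the patch; it only skips the download when the file already exists rather than verifying a checksum):

    if [[ -n "$video" ]]; then
        filename=$(basename "$video")
        videofile="/tmp/$filename"
        # Download only when the clip is not already cached in /tmp
        if [[ ! -f "$videofile" ]]; then
            echo "Downloading video $filename from $video"
            wget -O "$videofile" "$video"
        fi
    fi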
From 8956cb6725bf579026d7f2f33f1f0a084664c9f9 Mon Sep 17 00:00:00 2001
From: Deepak Gowda Doddbele Aswatha Narayana
Date: Tue, 25 Feb 2025 16:37:49 -0800
Subject: [PATCH 3/3] Update offline_inferece.py

---
 offline_inferece.py | 106 +++++++++++++++++++++++++++++++-------------
 1 file changed, 74 insertions(+), 32 deletions(-)

diff --git a/offline_inferece.py b/offline_inferece.py
index 188cc28759796..e5f7267bc404d 100644
--- a/offline_inferece.py
+++ b/offline_inferece.py
@@ -2,6 +2,8 @@
 from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
 import PIL
+import cv2
+from PIL import Image
 
 import argparse
@@ -10,6 +12,7 @@
 # Add arguments
 parser.add_argument("-m", "--model", help="model name or path")
 parser.add_argument("-i", "--image", help="type of image")
+parser.add_argument("-v", "--video", help="video input (path or URL)")
 parser.add_argument(
     "--multiple_prompts", action="store_true", help="to run with multiple prompts"
 )
@@ -18,7 +21,25 @@
 # Parse the arguments
 args = parser.parse_args()
 
-# Load the image
+# Process the video input and select num_frames evenly spaced frames
+def sample_frames(path, num_frames):
+    video = cv2.VideoCapture(path)
+    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+    interval = max(total_frames // num_frames, 1)  # guard against clips shorter than num_frames
+    frames = []
+    for i in range(total_frames):
+        ret, frame = video.read()
+        if not ret:  # skip frames that fail to decode
+            continue
+        if i % interval == 0:
+            # Convert OpenCV's BGR layout to RGB before handing the frame to PIL
+            pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            frames.append(pil_img.resize((256, 256)))
+    video.release()
+    return frames[:num_frames]
+
+# Load the image / video
 if args.image == "synthetic":
     image = ImageAsset("stop_sign").pil_image
@@ -27,8 +48,10 @@
 elif args.image == "snowscat":
     filename = "/tmp/snowscat-H3oXiq7_bII-unsplash.jpg"
     image = PIL.Image.open(filename)
+elif args.video:
+    video = sample_frames(args.video, 50)
 else:
-    print(f"unknow image {args.image}")
+    print(f"unknown image/video input: {args.image} {args.video}")
     exit(1)
 
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
@@ -41,43 +64,62 @@
 mdl = args.model
 multiple_prompt = args.multiple_prompts
 
-question = "Describe this image."
-# Prompt example: https://docs.vllm.ai/en/v0.6.2/getting_started/examples/offline_inference_vision_language.html
-if "Qwen2" in mdl:
-    llm = LLM(
-        model=mdl,
-        enforce_eager=False,
-        max_model_len=32768,
-        max_num_seqs=5,
-        limit_mm_per_prompt={"image": 1},
-    )
-    prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{question}g<|im_end|>\n<|im_start|>assistant\n"
+if args.video:
+    question = "Describe this video."
+    llm = LLM(
+        model=mdl,
+        enforce_eager=True,
+        dtype="bfloat16",
+        gpu_memory_utilization=0.6,
+    )
+    prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>{question}<|im_end|>\n<|im_start|>assistant\n"
 
     if multiple_prompt:
         batch_data = [
-            {"prompt": prompt, "multi_modal_data": {"image": image}},
+            {"prompt": prompt, "multi_modal_data": {"video": video}},
             {
-                "prompt": "<|im_start|>system\nYou are a nice person.<|im_end|>\n<|im_start|>user\nTell me about you.<|im_end|>\n<|im_start|>assistant\n"
+                "prompt": "<|im_start|>system\nYou are a nice person.<|im_end|>\n<|im_start|>user\nTell me about the future of global warming.<|im_end|>\n<|im_start|>assistant\n"
             },
-            {"prompt": prompt, "multi_modal_data": {"image": image}},
+            {"prompt": prompt, "multi_modal_data": {"video": video}},
         ]
     else:
-        batch_data = {"prompt": prompt, "multi_modal_data": {"image": image}}
-
-elif "Llama-3.2" in mdl:
-    llm = LLM(
-        model=mdl,
-        max_model_len=2048,
-        max_num_seqs=64,
-        tensor_parallel_size=1,
-        num_scheduler_steps=32,
-        max_num_prefill_seqs=4,
-    )
-    from vllm import TextPrompt
-    batch_data = TextPrompt(prompt=f"<|image|><|begin_of_text|>{question}")
-    batch_data["multi_modal_data"] = {"image": image}
-else:
-    print(f"{mdl} is not known model?")
+        batch_data = {"prompt": prompt, "multi_modal_data": {"video": video}}
+
+else:
+    question = "Describe this image."
+    # Prompt example: https://docs.vllm.ai/en/v0.6.2/getting_started/examples/offline_inference_vision_language.html
+    if "Qwen2" in mdl:
+        llm = LLM(
+            model=mdl,
+            enforce_eager=False,
+            max_model_len=32768,
+            max_num_seqs=5,
+            limit_mm_per_prompt={"image": 1},
+        )
+        prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{question}<|im_end|>\n<|im_start|>assistant\n"
+
+        if multiple_prompt:
+            batch_data = [
+                {"prompt": prompt, "multi_modal_data": {"image": image}},
+                {
+                    "prompt": "<|im_start|>system\nYou are a nice person.<|im_end|>\n<|im_start|>user\nTell me about you.<|im_end|>\n<|im_start|>assistant\n"
+                },
+                {"prompt": prompt, "multi_modal_data": {"image": image}},
+            ]
+        else:
+            batch_data = {"prompt": prompt, "multi_modal_data": {"image": image}}
+
+    elif "Llama-3.2" in mdl:
+        llm = LLM(
+            model=mdl,
+            max_model_len=2048,
+            max_num_seqs=64,
+            tensor_parallel_size=1,
+            num_scheduler_steps=32,
+            max_num_prefill_seqs=4,
+        )
+        from vllm import TextPrompt
+        # Restore the batch_data construction from the deleted block above;
+        # without it, this branch defines prompt but never builds batch_data.
+        batch_data = TextPrompt(prompt=f"<|image|><|begin_of_text|>{question}")
+        batch_data["multi_modal_data"] = {"image": image}
+    else:
+        print(f"unknown model: {mdl}")
 
 for i in range(int(args.iter)):
     print(f"==ITER : [{i}]")
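The visible context ends at the top of the iteration loop, so the generate call itself is not shown in the hunk. Going by vLLM's standard offline API, each iteration presumably passes batch_data and sampling_params to llm.generate. A minimal standalone sketch of the video path, with assumptions marked (the model stub and file path are placeholders, and 50 is the frame count hard-coded above):

    # Minimal end-to-end sketch of the video path; not the patch itself.
    from vllm import LLM, SamplingParams

    # sample_frames() is the helper added in this patch: it returns a list
    # of resized PIL images taken at even intervals from the clip.
    frames = sample_frames("/tmp/cats_1.mp4", 50)  # placeholder path

    llm = LLM(
        model="Qwen/Qwen2-VL-7B-Instruct",  # placeholder; the script passes -m/--model
        enforce_eager=True,
        dtype="bfloat16",
        gpu_memory_utilization=0.6,
    )
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
        "Describe this video.<|im_end|>\n<|im_start|>assistant\n"
    )
    batch_data = {"prompt": prompt, "multi_modal_data": {"video": frames}}

    # The loop body is presumably the standard vLLM offline call:
    outputs = llm.generate(batch_data, sampling_params)
    for out in outputs:
        print(out.outputs[0].text)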