diff --git a/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_Qwen2VL.ipynb b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_Qwen2VL.ipynb
new file mode 100644
index 00000000000000..8a5aa6277b11b2
--- /dev/null
+++ b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_Qwen2VL.ipynb
@@ -0,0 +1,1142 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n",
+    "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_Qwen2VL.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Import OpenVINO Qwen2VL models from HuggingFace πŸ€— into Spark NLP πŸš€\n",
+    "\n",
+    "This notebook provides a detailed walkthrough on optimizing and importing Qwen2VL models from HuggingFace for use in Spark NLP with the [Intel OpenVINO toolkit](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html). The focus is on converting the model to the OpenVINO format and applying INT8/INT4 weight compression with [NNCF](https://github.com/openvinotoolkit/nncf) to enhance performance and efficiency on CPU platforms.\n",
+    "\n",
+    "Let's keep in mind a few things before we start 😊\n",
+    "\n",
+    "- OpenVINO support was introduced in `Spark NLP 5.4.0`, enabling high-performance CPU inference for models. So please make sure you have upgraded to the latest Spark NLP release.\n",
+    "- Model quantization is a computationally expensive process, so it is recommended to use a runtime with more than 32GB of memory for exporting the quantized model from HuggingFace.\n",
+    "- You can import Qwen2VL models via `Qwen2VLForMultiModal`. These models are usually under the `Image-Text-to-Text` category and have `Qwen2VL` in their labels.\n",
+    "- Reference: [Qwen2VL](https://huggingface.co/docs/transformers/model_doc/qwen2_vl)\n",
+    "- Some [example models](https://huggingface.co/models?search=Qwen2VL)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Export and Save the HuggingFace model\n",
+    "\n",
+    "- Let's install `transformers` and `openvino` packages with other dependencies. You don't need `openvino` installed for Spark NLP; however, we need it to load and save models from HuggingFace.\n",
+    "- We install `transformers>=4.45.0` below. This doesn't mean it won't work with future releases, but we wanted you to know which versions have been tested successfully."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "%pip install -qU \"openvino>=2024.4.0\" \"nncf>=2.13.0\"\n", + "%pip install -q \"sentencepiece\" \"tokenizers>=0.12.1\" \"transformers>=4.45.0\" \"gradio>=4.36\"\n", + "%pip install -q -U --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly openvino-tokenizers openvino openvino-genai\n", + "%pip install -q --upgrade huggingface_hub\n", + "%pip install -q --upgrade torch>=2.2.1\n", + "%pip install -q --upgrade qwen-vl-utils\n", + "\n", + "utility_files = [\"notebook_utils.py\", \"cmd_helper.py\"]\n", + "\n", + "from pathlib import Path\n", + "import requests\n", + "\n", + "if not Path(\"ov_qwen2_vl.py\").exists():\n", + " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/qwen2-vl/ov_qwen2_vl.py\")\n", + " open(\"ov_qwen2_vl.py\", \"w\").write(r.text)\n", + "\n", + "if not Path(\"notebook_utils.py\").exists():\n", + " r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\")\n", + " open(\"notebook_utils.py\", \"w\").write(r.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.1 Convert the model to OpenVino" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4659f1c77b1b4fc28b2869cbed0ea309", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Model:', options=('Qwen/Qwen2-VL-2B-Instruct', 'Qwen/Qwen2-VL-7B-Instruct'), value='Qwen…" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from ov_qwen2_vl import model_selector\n", + "\n", + "model_id = model_selector()\n", + "\n", + "model_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected Qwen/Qwen2-VL-2B-Instruct\n" + ] + } + ], + "source": [ + "print(f\"Selected {model_id.value}\")\n", + "pt_model_id = model_id.value\n", + "model_dir = Path(pt_model_id.split(\"/\")[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('test/Qwen2-VL-2B-Instruct')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_dir" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βŒ› Qwen/Qwen2-VL-2B-Instruct conversion started. Be patient, it may takes some time.\n", + "βŒ› Load Original model\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. 
All other arguments will be removed in v4.46\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "afc3407b5faa4b8ea14e18fef35f69bb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00 target_length:\n", + "/home/prabod/anaconda3/envs/pth23/lib/python3.9/site-packages/transformers/cache_utils.py:444: TracerWarning: Using len to get tensor shape might cause the trace to be incorrect. Recommended usage would be tensor.shape[0]. Passing a tensor of different shape might lead to errors or silently give incorrect results.\n", + " len(self.key_cache[layer_idx]) == 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ… Language model successfully converted\n", + "βŒ› Weights compression with int4_asym mode started\n", + "INFO:nncf:Statistics of the bitwidth distribution:\n", + "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n", + "β”‚ Weight compression mode β”‚ % all parameters (layers) β”‚ % ratio-defining parameters (layers) β”‚\n", + "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━β”₯\n", + "β”‚ int8_asym β”‚ 15% (1 / 197) β”‚ 0% (0 / 196) β”‚\n", + "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n", + "β”‚ int4_asym β”‚ 85% (196 / 197) β”‚ 100% (196 / 196) β”‚\n", + "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b4a917cc8701479f8a37567a90cb7f54", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "βœ… Weights compression finished\n",
+      "βŒ› Convert Image embedding model\n",
+      "βŒ› Weights compression with int4_asym mode started\n",
+      "INFO:nncf:Statistics of the bitwidth distribution:\n",
+      "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n",
+      "β”‚ Weight compression mode   β”‚ % all parameters (layers)   β”‚ % ratio-defining parameters (layers)   β”‚\n",
+      "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━β”₯\n",
+      "β”‚ int8_asym                 β”‚ 1% (1 / 130)                β”‚ 0% (0 / 129)                           β”‚\n",
+      "β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€\n",
+      "β”‚ int4_asym                 β”‚ 99% (129 / 130)             β”‚ 100% (129 / 129)                       β”‚\n",
+      "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "041352b5ffce4e2886add167de6ca1ad",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "βœ… Weights compression finished\n",
+      "βœ… Image embedding model successfully converted\n",
+      "βœ… Qwen/Qwen2-VL-2B-Instruct model conversion finished. You can find results in test/Qwen2-VL-2B-Instruct\n"
+     ]
+    }
+   ],
+   "source": [
+    "from ov_qwen2_vl import convert_qwen2vl_model\n",
+    "import nncf\n",
+    "\n",
+    "compression_configuration = {\n",
+    "    \"mode\": nncf.CompressWeightsMode.INT4_ASYM,\n",
+    "    \"group_size\": 128,\n",
+    "    \"ratio\": 1.0,\n",
+    "}\n",
+    "\n",
+    "convert_qwen2vl_model(pt_model_id, model_dir, compression_configuration)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "class Qwen2ReshapePatches(nn.Module):\n",
+    "    def __init__(self,\n",
+    "                 temporal_patch_size: int = 2,\n",
+    "                 merge_size: int = 2,\n",
+    "                 patch_size: int = 14\n",
+    "                 ):\n",
+    "        super().__init__()\n",
+    "        self.temporal_patch_size = temporal_patch_size\n",
+    "        self.merge_size = merge_size\n",
+    "        self.patch_size = patch_size\n",
+    "\n",
+    "    def forward(self, patches, repetition_factor=1):\n",
+    "        # Repeat the patches along the first dimension\n",
+    "        patches = patches.repeat(repetition_factor, 1, 1, 1)\n",
+    "        channel = patches.shape[1]\n",
+    "        grid_t = patches.shape[0] // self.temporal_patch_size\n",
+    "        resized_height = patches.shape[2]\n",
+    "        resized_width = patches.shape[3]\n",
+    "        grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size\n",
+    "        patches = patches.reshape(\n",
+    "            grid_t,\n",
+    "            self.temporal_patch_size,\n",
+    "            channel,\n",
+    "            grid_h // self.merge_size,\n",
+    "            self.merge_size,\n",
+    "            self.patch_size,\n",
+    "            grid_w // self.merge_size,\n",
+    "            self.merge_size,\n",
+    "            self.patch_size,\n",
+    "        )\n",
+    "        patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)\n",
+    "        flatten_patches = patches.reshape(\n",
+    "            grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size\n",
+    "        )\n",
+    "\n",
+    "        return flatten_patches\n",
+    "\n",
+    "\n",
+    "patch_reshape_model = Qwen2ReshapePatches()"
+   ]
+  },
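+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before converting this wrapper, we can run it eagerly as a quick sanity check. The cell below is an optional, illustrative addition: it assumes the default Qwen2-VL preprocessing sizes (`temporal_patch_size=2`, `merge_size=2`, `patch_size=14`) and checks that a dummy input of shape `(1, 3, 1372, 2044)` with `repetition_factor=2` flattens to `(14308, 1176)`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check (illustrative): run the reshape wrapper eagerly on a dummy\n",
+    "# pre-processed image and verify the flattened patch layout.\n",
+    "dummy = torch.ones((1, 3, 1372, 2044), dtype=torch.float32)  # (frames, channels, H, W)\n",
+    "flat = patch_reshape_model(dummy, repetition_factor=2)\n",
+    "\n",
+    "grid_t = 2 // 2                          # repeated frames / temporal_patch_size\n",
+    "grid_h, grid_w = 1372 // 14, 2044 // 14  # height / patch_size, width / patch_size\n",
+    "print(flat.shape)                        # expected: (grid_t*grid_h*grid_w, 3*2*14*14) = (14308, 1176)\n",
+    "assert flat.shape == (grid_t * grid_h * grid_w, 3 * 2 * 14 * 14)"
+   ]
+  },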
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openvino as ov\n",
+    "\n",
+    "\n",
+    "ov_model = ov.convert_model(\n",
+    "            patch_reshape_model,\n",
+    "            example_input={\n",
+    "                \"patches\": torch.ones((1, 3, 1372, 2044), dtype=torch.float32),\n",
+    "                \"repetition_factor\": torch.tensor(2),\n",
+    "            }\n",
+    "        )\n",
+    "\n",
+    "# Save the OpenVINO model\n",
+    "ov.save_model(ov_model, model_dir/\"openvino_patch_reshape_model.xml\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers.models.qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding\n",
+    "from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoConfig\n",
+    "\n",
+    "config = AutoConfig.from_pretrained(\"Qwen/Qwen2-VL-2B-Instruct\")\n",
+    "\n",
+    "\n",
+    "class RotaryEmbedding(nn.Module):\n",
+    "\n",
+    "    def __init__(self, embed_dim, spatial_merge_size):\n",
+    "        super().__init__()\n",
+    "        self._rotary_pos_emb = VisionRotaryEmbedding(embed_dim)\n",
+    "        self.spatial_merge_size = spatial_merge_size\n",
+    "    \n",
+    "    def forward(self, grid_thw):\n",
+    "        t, h, w = grid_thw\n",
+    "        pos_ids = []\n",
+    "        # for t, h, w in grid_thw:\n",
+    "\n",
+    "        hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)\n",
+    "        hpos_ids = hpos_ids.reshape(\n",
+    "            h // self.spatial_merge_size,\n",
+    "            self.spatial_merge_size,\n",
+    "            w // self.spatial_merge_size,\n",
+    "            self.spatial_merge_size,\n",
+    "        )\n",
+    "        hpos_ids = hpos_ids.permute(0, 2, 1, 3)\n",
+    "        hpos_ids = hpos_ids.flatten()\n",
+    "\n",
+    "        wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)\n",
+    "        wpos_ids = wpos_ids.reshape(\n",
+    "            h // self.spatial_merge_size,\n",
+    "            self.spatial_merge_size,\n",
+    "            w // self.spatial_merge_size,\n",
+    "            self.spatial_merge_size,\n",
+    "        )\n",
+    "        wpos_ids = wpos_ids.permute(0, 2, 1, 3)\n",
+    "        wpos_ids = wpos_ids.flatten()\n",
+    "        pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))\n",
+    "        pos_ids = torch.cat(pos_ids, dim=0)\n",
+    "        max_grid_size = grid_thw.max()\n",
+    "        rotary_pos_emb_full = self._rotary_pos_emb(max_grid_size)\n",
+    "        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)\n",
+    "        return rotary_pos_emb\n",
+    "\n",
+    "\n",
+    "\n",
+    "vision_rotary_embedding = RotaryEmbedding(config.vision_config.embed_dim // config.vision_config.num_heads // 2, config.vision_config.spatial_merge_size)\n"
+   ]
+  },
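+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an optional, illustrative check we can call the wrapper eagerly on the same grid that is later used as the tracing example. The expected width depends on the vision head dimension of the selected checkpoint (40, i.e. half of the 80-dim heads, for the 2B config used here)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional eager check of the rotary-embedding wrapper before tracing.\n",
+    "# For a single image with grid_thw = (t, h, w) = (1, 98, 146) the wrapper returns\n",
+    "# one embedding row per patch, i.e. (t*h*w, head_dim/2) values.\n",
+    "with torch.no_grad():\n",
+    "    emb = vision_rotary_embedding(torch.tensor([1, 98, 146]))\n",
+    "print(emb.shape)  # e.g. torch.Size([14308, 40]) for Qwen2-VL-2B-Instruct"
+   ]
+  },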
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_1830471/1989675311.py:15: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results).\n",
+      "  t, h, w = grid_thw\n"
+     ]
+    }
+   ],
+   "source": [
+    "import openvino as ov\n",
+    "\n",
+    "vision_embedding_ov = ov.convert_model(\n",
+    "    vision_rotary_embedding,\n",
+    "    example_input={\n",
+    "        \"grid_thw\": torch.tensor([1, 98, 146]),\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "# Save the OpenVINO model\n",
+    "ov.save_model(vision_embedding_ov, model_dir/\"openvino_rotary_embeddings_model.xml\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class MergeMultiModalInputs(torch.nn.Module):\n",
+    "    def __init__(self,image_token_index=151655):\n",
+    "        super().__init__()\n",
+    "        self.image_token_index = image_token_index\n",
+    "\n",
+    "    def forward(\n",
+    "        self,\n",
+    "        vision_embeds,\n",
+    "        inputs_embeds,\n",
+    "        input_ids,\n",
+    "    ):\n",
+    "        image_features = vision_embeds\n",
+    "        inputs_embeds = inputs_embeds\n",
+    "        special_image_mask = (input_ids == self.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)\n",
+    "        # image_features = image_features.to(inputs_embeds.dtype)\n",
+    "        final_embedding = inputs_embeds.masked_scatter(special_image_mask, image_features)\n",
+    "\n",
+    "        return {\n",
+    "            \"inputs_embeds\": final_embedding\n",
+    "        }\n",
+    "\n",
+    "torch_model_merge = MergeMultiModalInputs()"
+   ]
+  },
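+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The toy example below (added purely for illustration) shows what the merge module does: wherever `input_ids` equals the image-pad token id (`151655`), the corresponding rows of the text embeddings are overwritten with the vision embeddings via `masked_scatter`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Toy example (illustrative): splice vision embeddings into the text embedding\n",
+    "# sequence at the positions of the image-pad token.\n",
+    "toy_ids = torch.tensor([[1, 151655, 151655, 2]])                     # two image-pad tokens\n",
+    "toy_text = torch.zeros((1, 4, 8))                                    # (batch, seq, hidden)\n",
+    "toy_vision = torch.arange(2 * 8, dtype=torch.float32).reshape(2, 8)  # one row per image token\n",
+    "\n",
+    "merged = torch_model_merge(toy_vision, toy_text, toy_ids)[\"inputs_embeds\"]\n",
+    "print(merged[0, 1, :4], merged[0, 2, :4])  # rows 1 and 2 now hold the vision embeddings"
+   ]
+  },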
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openvino as ov\n",
+    "\n",
+    "# convert MergeMultiModalInputs to OpenVINO IR\n",
+    "ov_model_merge = ov.convert_model(\n",
+    "    torch_model_merge,\n",
+    "    example_input={\n",
+    "        \"vision_embeds\": torch.randn((3577, 1536), dtype=torch.float32),\n",
+    "        \"inputs_embeds\": torch.randn((1, 3602, 1536), dtype=torch.float32),\n",
+    "        \"input_ids\": torch.randint(0, 151656, (1, 3602), dtype=torch.long),\n",
+    "    }\n",
+    ")\n",
+    "ov.save_model(ov_model_merge, model_dir/\"openvino_multimodal_merge_model.xml\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1.2 Load openvino models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "LANGUAGE_MODEL_NAME = \"openvino_language_model.xml\"\n",
+    "IMAGE_EMBEDDING_NAME = \"openvino_vision_embeddings_model.xml\"\n",
+    "IMAGE_EMBEDDING_MERGER_NAME = \"openvino_vision_embeddings_merger_model.xml\"\n",
+    "TEXT_EMBEDDING_NAME = \"openvino_text_embeddings_model.xml\"\n",
+    "ROTARY_EMBEDDING_NAME = \"openvino_rotary_embeddings_model.xml\"\n",
+    "PATCH_RESHAPE_NAME = \"openvino_patch_reshape_model.xml\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openvino as ov\n",
+    "import gc\n",
+    "\n",
+    "core = ov.Core()\n",
+    "model_path = model_dir\n",
+    "\n",
+    "language_model = core.read_model(model_path / LANGUAGE_MODEL_NAME)\n",
+    "compiled_language_model = core.compile_model(language_model, \"CPU\")\n",
+    "request = compiled_language_model.create_infer_request()\n",
+    "\n",
+    "image_embedding = core.compile_model(model_path / IMAGE_EMBEDDING_NAME, \"CPU\")\n",
+    "image_embedding_merger = core.compile_model(model_path / IMAGE_EMBEDDING_MERGER_NAME, \"CPU\")\n",
+    "text_embedding = core.compile_model(model_path / TEXT_EMBEDDING_NAME, \"CPU\")\n",
+    "rotary_embedding = core.compile_model(model_path / ROTARY_EMBEDDING_NAME, \"CPU\")\n",
+    "patch_reshape = core.compile_model(model_path / PATCH_RESHAPE_NAME, \"CPU\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "βŒ› Check if all models are converted\n",
+      "βœ… All models are converted. You can find results in test/Qwen2-VL-2B-Instruct\n"
+     ]
+    }
+   ],
+   "source": [
+    "# check if all the models are converted\n",
+    "\n",
+    "print(\"βŒ› Check if all models are converted\")\n",
+    "language_model_path = model_dir / LANGUAGE_MODEL_NAME\n",
+    "image_embed_path = model_dir / IMAGE_EMBEDDING_NAME\n",
+    "image_merger_path = model_dir / IMAGE_EMBEDDING_MERGER_NAME\n",
+    "text_embed_path = model_dir / TEXT_EMBEDDING_NAME\n",
+    "rotary_embed_path = model_dir / ROTARY_EMBEDDING_NAME\n",
+    "patch_reshape_path = model_dir / PATCH_RESHAPE_NAME\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "if all(\n",
+    "    [\n",
+    "        language_model_path.exists(),\n",
+    "        image_embed_path.exists(),\n",
+    "        image_merger_path.exists(),\n",
+    "        text_embed_path.exists(),\n",
+    "        rotary_embed_path.exists(),\n",
+    "        patch_reshape_path.exists(),\n",
+    "    ]\n",
+    "):\n",
+    "    print(f\"βœ… All models are converted. You can find results in {model_dir}\")\n",
+    "else:\n",
+    "    print(\"❌ Not all models are converted. Please check the conversion process\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1.2 Copy assets to the assets folder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assets_dir = model_dir / \"assets\"\n",
+    "assets_dir.mkdir(exist_ok=True)\n",
+    "\n",
+    "# copy all the assets to the assets directory (json files, vocab files, etc.)\n",
+    "\n",
+    "import shutil\n",
+    "\n",
+    "# copy all json files\n",
+    "\n",
+    "for file in model_dir.glob(\"*.json\"):\n",
+    "    shutil.copy(file, assets_dir)\n",
+    "\n",
+    "    \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total 1.7G\n",
+      "-rw-rw-r-- 1 prabod prabod  392 Dec 10 06:55 added_tokens.json\n",
+      "drwxrwxr-x 2 prabod prabod 4.0K Dec 10 06:59 assets\n",
+      "-rw-rw-r-- 1 prabod prabod 1.1K Dec 10 06:55 chat_template.json\n",
+      "-rw-rw-r-- 1 prabod prabod 1.2K Dec 10 06:55 config.json\n",
+      "-rw-rw-r-- 1 prabod prabod 1.6M Dec 10 06:55 merges.txt\n",
+      "-rw-rw-r-- 1 prabod prabod 873M Dec 10 06:57 openvino_language_model.bin\n",
+      "-rw-rw-r-- 1 prabod prabod 3.5M Dec 10 06:57 openvino_language_model.xml\n",
+      "-rw-rw-r-- 1 prabod prabod   40 Dec 10 06:58 openvino_multimodal_merge_model.bin\n",
+      "-rw-rw-r-- 1 prabod prabod 9.8K Dec 10 06:58 openvino_multimodal_merge_model.xml\n",
+      "-rw-rw-r-- 1 prabod prabod  132 Dec 10 06:58 openvino_patch_reshape_model.bin\n",
+      "-rw-rw-r-- 1 prabod prabod  24K Dec 10 06:58 openvino_patch_reshape_model.xml\n",
+      "-rw-rw-r-- 1 prabod prabod  132 Dec 10 06:58 openvino_rotary_embeddings_model.bin\n",
+      "-rw-rw-r-- 1 prabod prabod  30K Dec 10 06:58 openvino_rotary_embeddings_model.xml\n",
+      "-rw-rw-r-- 1 prabod prabod 446M Dec 10 06:55 openvino_text_embeddings_model.bin\n",
+      "-rw-rw-r-- 1 prabod prabod 2.9K Dec 10 06:55 openvino_text_embeddings_model.xml\n",
+      "-rw-rw-r-- 1 prabod prabod 334M Dec 10 06:58 openvino_vision_embeddings_merger_model.bin\n",
+      "-rw-rw-r-- 1 prabod prabod 2.1M Dec 10 06:58 openvino_vision_embeddings_merger_model.xml\n",
+      "-rw-rw-r-- 1 prabod prabod 2.9M Dec 10 06:57 openvino_vision_embeddings_model.bin\n",
+      "-rw-rw-r-- 1 prabod prabod 4.4K Dec 10 06:57 openvino_vision_embeddings_model.xml\n",
+      "-rw-rw-r-- 1 prabod prabod  567 Dec 10 06:55 preprocessor_config.json\n",
+      "-rw-rw-r-- 1 prabod prabod  613 Dec 10 06:55 special_tokens_map.json\n",
+      "-rw-rw-r-- 1 prabod prabod 4.3K Dec 10 06:55 tokenizer_config.json\n",
+      "-rw-rw-r-- 1 prabod prabod  11M Dec 10 06:55 tokenizer.json\n",
+      "-rw-rw-r-- 1 prabod prabod 2.7M Dec 10 06:55 vocab.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls -lh {model_dir}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total 14M\n",
+      "-rw-rw-r-- 1 prabod prabod  392 Dec 10 07:05 added_tokens.json\n",
+      "-rw-rw-r-- 1 prabod prabod 1.1K Dec 10 07:05 chat_template.json\n",
+      "-rw-rw-r-- 1 prabod prabod 1.2K Dec 10 07:05 config.json\n",
+      "-rw-rw-r-- 1 prabod prabod  567 Dec 10 07:05 preprocessor_config.json\n",
+      "-rw-rw-r-- 1 prabod prabod  613 Dec 10 07:05 special_tokens_map.json\n",
+      "-rw-rw-r-- 1 prabod prabod 4.3K Dec 10 07:05 tokenizer_config.json\n",
+      "-rw-rw-r-- 1 prabod prabod  11M Dec 10 07:05 tokenizer.json\n",
+      "-rw-rw-r-- 1 prabod prabod 2.7M Dec 10 07:05 vocab.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls -lh {assets_dir}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1.3 Test the openvino model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openvino as ov\n",
+    "import torch\n",
+    "from pathlib import Path\n",
+    "core = ov.Core()\n",
+    "device = \"CPU\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "model_path = Path(\"/mnt/research/Projects/ModelZoo/QWEN2-VL/test/Qwen2-VL-2B-Instruct\")\n",
+    "\n",
+    "language_model = core.read_model(model_path / LANGUAGE_MODEL_NAME)\n",
+    "compiled_language_model = core.compile_model(language_model, \"CPU\")\n",
+    "request = compiled_language_model.create_infer_request()\n",
+    "\n",
+    "image_embedding = core.compile_model(model_path / IMAGE_EMBEDDING_NAME, \"CPU\")\n",
+    "image_embedding_merger = core.compile_model(model_path / IMAGE_EMBEDDING_MERGER_NAME, \"CPU\")\n",
+    "text_embedding = core.compile_model(model_path / TEXT_EMBEDDING_NAME, \"CPU\")\n",
+    "rotary_embedding = core.compile_model(model_path / ROTARY_EMBEDDING_NAME, \"CPU\")\n",
+    "patch_reshape = core.compile_model(model_path / PATCH_RESHAPE_NAME, \"CPU\")"
+   ]
+  },
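+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before running generation it can help to list the input and output names the compiled language model actually exposes, since the generation loop below feeds tensors by name (`inputs_embeds`, `attention_mask`, `position_ids`, and `beam_idx` for stateful models). This inspection cell is an optional addition."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: inspect the I/O names exposed by the compiled language model.\n",
+    "lm_input_names = [inp.get_any_name() for inp in compiled_language_model.inputs]\n",
+    "lm_output_names = [out.get_any_name() for out in compiled_language_model.outputs]\n",
+    "print(\"inputs :\", lm_input_names)\n",
+    "print(\"outputs:\", lm_output_names)"
+   ]
+  },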
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "56e7f99e52234dbc9d2f7cb0306cd668",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00 1:\n",
+    "        hidden_states = torch.from_numpy(image_embedding(pixel_values)[0])\n",
+    "        rotary_pos_emb = torch.cat([torch.from_numpy(rotary_embedding(x)[0]) for x in image_grid_thw], dim=0)\n",
+    "        grid_thw = image_grid_thw\n",
+    "        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(dim=0, dtype=torch.int32)\n",
+    "        cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0)\n",
+    "        attention_mask = torch.zeros((1, hidden_states.shape[0], hidden_states.shape[0]), dtype=torch.bool)\n",
+    "        causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32)\n",
+    "        for i in range(1, len(cu_seqlens)):\n",
+    "            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True\n",
+    "\n",
+    "        causal_mask.masked_fill_(torch.logical_not(attention_mask), float(\"-inf\"))\n",
+    "\n",
+    "        image_embeds = torch.from_numpy(image_embedding_merger(\n",
+    "            {\n",
+    "                \"hidden_states\": hidden_states,\n",
+    "                \"rotary_pos_emb\": rotary_pos_emb,\n",
+    "                \"attention_mask\": attention_mask,\n",
+    "            }\n",
+    "        )[0])\n",
+    "        image_mask = input_ids == config.image_token_id\n",
+    "        inputs_embeds[image_mask] = image_embeds\n",
+    "    # break\n",
+    "    if i>0:\n",
+    "        inputs = {}\n",
+    "\n",
+    "    if current_input_ids.shape[-1] > 1:\n",
+    "        attention_mask = inputs_new[\"attention_mask\"]\n",
+    "        position_ids = torch.arange(current_input_ids.shape[1], device=current_input_ids.device).view(1, 1, -1).expand(3, current_input_ids.shape[0], -1)\n",
+    "    \n",
+    "    # Prepare inputs for the model\n",
+    "    inputs[\"inputs_embeds\"] = inputs_embeds\n",
+    "    inputs[\"attention_mask\"] = attention_mask\n",
+    "    inputs[\"position_ids\"] = position_ids\n",
+    "    if \"beam_idx\" in input_names:\n",
+    "        inputs[\"beam_idx\"] = np.arange(inputs_embeds.shape[0], dtype=int)\n",
+    "    \n",
+    "    # Start inference\n",
+    "    request.start_async(inputs, share_inputs=True)\n",
+    "    request.wait()\n",
+    "    \n",
+    "    # Get the logits and find the next token\n",
+    "    logits = torch.from_numpy(request.get_tensor(\"logits\").data)\n",
+    "    next_token = logits.argmax(-1)[0][-1]\n",
+    "\n",
+    "    # Append the generated token\n",
+    "    generated_tokens.append(next_token)\n",
+    "    \n",
+    "    # Update input_ids with the new token\n",
+    "    current_input_ids = torch.cat([next_token.unsqueeze(0).unsqueeze(0)], dim=-1)\n",
+    "    \n",
+    "    position_ids = torch.tensor(inputs_new[\"input_ids\"].shape[-1] + i).view(1, 1, -1).expand(3, current_input_ids.shape[0], -1)\n",
+    "\n",
+    "    inputs[\"position_ids\"] = position_ids\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Question:\n",
+      " Describe this Image\n",
+      "Answer:\n",
+      "The Bennett is sitting on the beach, wearing a plaid shirt and black pants. She is smiling and appears to be enjoying her time outdoors. A dog is sitting next to her, wearing a harness and leash. The beach is sandy and the sky\n"
+     ]
+    }
+   ],
+   "source": [
+    "output_text = processor.batch_decode(\n",
+    "    generated_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
+    ")\n",
+    "\n",
+    "print(\"Question:\\n Describe this Image\")\n",
+    "print(\"Answer:\")\n",
+    "print(\"\".join(output_text))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Import and Save Qwen2VL in Spark NLP\n",
+    "\n",
+    "- Let's install and setup Spark NLP in Google Colab\n",
+    "- This part is pretty easy via our simple script"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's start Spark with Spark NLP included via our simple `start()` function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "24/11/07 09:56:55 WARN Utils: Your hostname, minotaur resolves to a loopback address: 127.0.1.1; using 192.168.1.4 instead (on interface eno1)\n",
+      "24/11/07 09:56:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
+      "24/11/07 09:56:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sparknlp\n",
+    "\n",
+    "# let's start Spark with Spark NLP\n",
+    "spark = sparknlp.start()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "24/11/07 09:57:34 WARN NativeLibrary: Failed to load library null: java.lang.UnsatisfiedLinkError: Can't load library: /tmp/openvino-native15331424460843812197/libtbb.so.2\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING: An illegal reflective access operation has occurred\n",
+      "WARNING: Illegal reflective access by org.apache.spark.util.SizeEstimator$ (file:/home/prabod/spark/jars/spark-core_2.12-3.3.2.jar) to field java.util.regex.Pattern.pattern\n",
+      "WARNING: Please consider reporting this to the maintainers of org.apache.spark.util.SizeEstimator$\n",
+      "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
+      "WARNING: All illegal access operations will be denied in a future release\n"
+     ]
+    }
+   ],
+   "source": [
+    "imageClassifier = Qwen2VLForMultiModal.pretrained() \\\n",
+    "            .setInputCols(\"image_assembler\") \\\n",
+    "            .setOutputCol(\"answer\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imageClassifier.write().overwrite().save(\"Qwen2VL_spark_nlp\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sparknlp\n",
+    "from sparknlp.base import *\n",
+    "from sparknlp.annotator import *\n",
+    "from pyspark.sql.functions import lit\n",
+    "from pyspark.ml import Pipeline\n",
+    "from pathlib import Path\n",
+    "import os\n",
+    "\n",
+    "# download two images to test into ./images folder\n",
+    "\n",
+    "url1 = \"https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11\"\n",
+    "url2 = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
+    "\n",
+    "Path(\"images\").mkdir(exist_ok=True)\n",
+    "\n",
+    "!wget -q -O images/image1.jpg {url1}\n",
+    "!wget -q -O images/image2.jpg {url2}\n",
+    "\n",
+    "\n",
+    "\n",
+    "images_path = \"file://\" + os.getcwd() + \"/images/\"\n",
+    "image_df = spark.read.format(\"image\").load(\n",
+    "    path=images_path\n",
+    ")\n",
+    "\n",
+    "test_df = image_df.withColumn(\"text\", lit(\"<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n\"))\n",
+    "\n",
+    "image_assembler = ImageAssembler().setInputCol(\"image\").setOutputCol(\"image_assembler\")\n",
+    "\n",
+    "imageClassifier = Qwen2VLForMultiModal.load(\"Qwen2VL_spark_nlp\")\\\n",
+    "            .setMaxOutputLength(50) \\\n",
+    "            .setInputCols(\"image_assembler\") \\\n",
+    "            .setOutputCol(\"answer\")\n",
+    "\n",
+    "pipeline = Pipeline(\n",
+    "            stages=[\n",
+    "                image_assembler,\n",
+    "                imageClassifier,\n",
+    "            ]\n",
+    "        )\n",
+    "\n",
+    "model = pipeline.fit(test_df)"
+   ]
+  },
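+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, you can also run the fitted pipeline on the image DataFrame as a batch `transform`; the `LightPipeline` example that follows annotates a single local image instead."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: batch inference over the image DataFrame with the fitted pipeline.\n",
+    "result_df = model.transform(test_df)\n",
+    "result_df.select(\"image.origin\", \"answer.result\").show(truncate=False)"
+   ]
+  },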
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "image_path: /mnt/research/Projects/ModelZoo/LLAVA/images/image1.jpg\n",
+      "[Annotation(document, 0, 363, This image features a cat comfortably laying inside a cardboard box. The cat appears to be relaxed and enjoying its cozy spot. The scene takes place on a carpeted floor, which adds to the overall warm and inviting atmosphere of the image. The cat's position inside the box creates a sense of security and contentment, making it an endearing and heartwarming scene., Map(), [])]\n"
+     ]
+    }
+   ],
+   "source": [
+    "light_pipeline = LightPipeline(model)\n",
+    "image_path = os.getcwd() + \"/images/\" + \"image1.jpg\"\n",
+    "print(\"image_path: \" + image_path)\n",
+    "annotations_result = light_pipeline.fullAnnotateImage(\n",
+    "    image_path,\n",
+    "    \"<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n\"\n",
+    ")\n",
+    "\n",
+    "for result in annotations_result:\n",
+    "    print(result[\"answer\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pth23",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}