
Commit

0.5.0
matatonic committed Apr 5, 2024
1 parent 4827ec0 commit 3401b8e
Showing 18 changed files with 556 additions and 89 deletions.
25 changes: 17 additions & 8 deletions README.md
@@ -11,28 +11,35 @@ Backend Model support:
- [X] [LlavaNext](https://huggingface.co/llava-hf) - (llava-v1.6-mistral-7b-hf, llava-v1.6-34b-hf; note that llava-v1.6-34b-hf is not working well yet) *(only supports a single image)
- [X] [Llava](https://huggingface.co/llava-hf) - (llava-v1.5-vicuna-7b-hf, llava-v1.5-vicuna-13b-hf, llava-v1.5-bakLlava-7b-hf) *(only supports a single image)
- [X] [Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)
- [X] [InternLM-XComposer2](https://huggingface.co/internlm/internlm-xcomposer2-7b) [finetune] (multi-image chat model; you may need to add "in English" to the first prompt)
- [X] [InternLM-XComposer2-VL](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b) [pretrain] *(only supports a single image)
- [X] Moondream2 - [vikhyatk/moondream2](https://huggingface.co/vikhyatk/moondream2) *(only supports a single image)
- [ ] Moondream1 - [vikhyatk/moondream1](https://huggingface.co/vikhyatk/moondream1)
- [ ] Deepseek-VL - [deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)
- [X] [openbmb/MiniCPM-V](https://huggingface.co/openbmb/MiniCPM-V) (aka. OmniLMM-3B) *(only supports a single image)
- [ ] [openbmb/OmniLMM-12B](https://huggingface.co/openbmb/OmniLMM-12B)
- [ ] [echo840/Monkey](https://huggingface.co/echo840/Monkey)
- [ ] [YanweiLi/MiniGemini](https://huggingface.co/collections/YanweiLi/)
- [ ] ...


Some vision systems include their own OpenAI compatible API server. Also included are some pre-built images and docker-compose for them:
- [X] [THUDM/CogVLM](https://github.com/THUDM/CogVLM) ([cogvlm-chat-hf](https://huggingface.co/THUDM/cogvlm-chat-hf), [cogagent-chat-hf](https://huggingface.co/THUDM/cogagent-chat-hf)), `docker-compose.cogvlm.yml` **Recommended for 16GB-40GB GPUs**
- [X] [01-ai](https://huggingface.co/01-ai)/Yi-VL ([Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B), [Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)), `docker-compose.yi-vl.yml`

Version: 0.4.0
Version: 0.5.0

Recent updates:
- new backend: XComposer2 (multi-image finetuned chat model)
- new backend: XComposer2-VL (single image pretrained model)
- new backend: MiniCPM-V aka. OmniLMM-3B
- Yi-VL and CogVLM (docker containers only)
- new backend: Qwen-VL
- new backend: llava (1.5)
- new backend: llavanext (1.6+)
- multi-turn questions & answers
- chat_with_images.py test tool and code sample
- selectable chat formats (phi15, vicuna, chatml, llama2/mistral)
- selectable chat formats
- flash attention 2, accelerate (device split), bitsandbytes (4bit, 8bit) support


@@ -62,7 +69,7 @@ Usage
-----

```
usage: vision.py [-h] [-m MODEL] [-b BACKEND] [-f FORMAT] [--load-in-4bit] [--load-in-8bit] [--use-flash-attn] [-d DEVICE] [-P PORT] [-H HOST] [--preload]
usage: vision.py [-h] [-m MODEL] [-b BACKEND] [-f FORMAT] [-d DEVICE] [--no-trust-remote-code] [-4] [-8] [-F] [-P PORT] [-H HOST] [--preload]
OpenedAI Vision API Server
@@ -71,14 +78,16 @@ options:
-m MODEL, --model MODEL
The model to use, Ex. llava-hf/llava-v1.6-mistral-7b-hf (default: vikhyatk/moondream2)
-b BACKEND, --backend BACKEND
The backend to use (moondream1, moondream2, llavanext, llava, qwen-vl) (default: moondream2)
Force the backend to use (moondream1, moondream2, llavanext, llava, qwen-vl) (default: None)
-f FORMAT, --format FORMAT
Force a specific chat format. (vicuna, mistral, chatml, llama2, phi15) (default: None)
--load-in-4bit load in 4bit (doesn't work with all models) (default: False)
--load-in-8bit load in 8bit (doesn't work with all models) (default: False)
--use-flash-attn Use Flash Attention 2 (doesn't work with all models or GPU) (default: False)
Force a specific chat format. (vicuna, mistral, chatml, llama2, phi15, gemma) (doesn't work with all models) (default: None)
-d DEVICE, --device DEVICE
Set the torch device for the model. Ex. cuda:1 (default: auto)
--no-trust-remote-code
Don't trust remote code (required for some models) (default: False)
-4, --load-in-4bit load in 4bit (doesn't work with all models) (default: False)
-8, --load-in-8bit load in 8bit (doesn't work with all models) (default: False)
-F, --use-flash-attn Use Flash Attention 2 (doesn't work with all models or GPU) (default: False)
-P PORT, --port PORT Server tcp port (default: 5006)
-H HOST, --host HOST Host to listen on, Ex. localhost (default: 0.0.0.0)
--preload Preload model and exit. (default: False)
```
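
Example: with the server running (e.g. started as `python vision.py -m llava-hf/llava-v1.6-mistral-7b-hf`), any OpenAI-style client can send images. Below is a minimal client sketch using the official `openai` Python package; the `/v1` base path, the dummy API key, the served model name, and the image URL are assumptions here, while the default port 5006 comes from the options above.

```python
# Minimal sketch of a chat-with-image request against the local server.
# Assumes an OpenAI-compatible /v1 route on the default port 5006.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:5006/v1", api_key="sk-none")  # key assumed to be ignored locally

response = client.chat.completions.create(
    model="llava-hf/llava-v1.6-mistral-7b-hf",  # assumed: whatever model vision.py was started with
    max_tokens=256,
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
        ],
    }],
)
print(response.choices[0].message.content)
```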
27 changes: 27 additions & 0 deletions backend/generic.py
@@ -0,0 +1,27 @@
from transformers import AutoTokenizer, AutoModelForCausalLM

from vision_qna import *

class VisionQnA(VisionQnABase):
model_name: str = "generic"

def __init__(self, model_id: str, device: str, extra_params = {}, format = None):
super().__init__(model_id, device, extra_params, format)

if not format:
self.format = guess_model_format(model_id)

self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str:
images, prompt = await prompt_from_messages(messages, self.format)

encoded_images = self.model.encode_image(images)
inputs = self.tokenizer(prompt, encoded_images, return_tensors="pt")
output = self.model.generate(**inputs, max_new_tokens=max_tokens)
response = self.tokenizer.decode(output[0], skip_special_tokens=True)

return answer_from_response(response, self.format)
26 changes: 5 additions & 21 deletions backend/llava.py
@@ -1,6 +1,7 @@
from transformers import LlavaProcessor, LlavaForConditionalGeneration
from vision_qna import *

#
# llava-hf/bakLlava-v1-hf # llama2
# llava-hf/llava-1.5-7b-hf # vicuna
# llava-hf/llava-1.5-13b-hf # vicuna
@@ -13,16 +14,10 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None)
super().__init__(model_id, device, extra_params, format)

if not format:
# guess the format based on model id
if 'mistral' in model_id.lower():
self.format = 'llama2'
elif 'bakllava' in model_id.lower():
self.format = 'llama2'
elif 'vicuna' in model_id.lower():
self.format = 'vicuna'
self.format = guess_model_format(model_id)

self.processor = LlavaProcessor.from_pretrained(model_id)
self.model = LlavaForConditionalGeneration.from_pretrained(**self.params)
self.model = LlavaForConditionalGeneration.from_pretrained(**self.params).eval()

print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

@@ -32,17 +27,6 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st
inputs = self.processor(prompt, images, return_tensors="pt").to(self.device)

output = self.model.generate(**inputs, max_new_tokens=max_tokens)
answer = self.processor.decode(output[0], skip_special_tokens=True)
response = self.processor.decode(output[0], skip_special_tokens=True)

if self.format in ['llama2', 'mistral']:
idx = answer.rfind('[/INST]') + len('[/INST]') + 1 #+ len(images)
return answer[idx:]
elif self.format == 'vicuna':
idx = answer.rfind('ASSISTANT:') + len('ASSISTANT:') + 1 #+ len(images)
return answer[idx:]
elif self.format == 'chatml':
idx = answer.rfind('<|im_user|>assistant\n') + len('<|im_user|>assistant\n') + 1 #+ len(images)
end_idx = answer.rfind('<|im_end|>')
return answer[idx:end_idx]

return answer
return answer_from_response(response, self.format)
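
Both llava backends now delegate answer extraction to `answer_from_response` from `vision_qna`. A sketch of what that helper plausibly does, reconstructed from the per-format branches removed above (the actual implementation is not shown in this diff):

```python
# Sketch only: reconstructed from the removed per-format branches above;
# the real answer_from_response in vision_qna may differ.
def answer_from_response(response: str, format: str) -> str:
    if format in ('llama2', 'mistral'):
        idx = response.rfind('[/INST]') + len('[/INST]') + 1
        return response[idx:]
    elif format == 'vicuna':
        idx = response.rfind('ASSISTANT:') + len('ASSISTANT:') + 1
        return response[idx:]
    elif format == 'chatml':
        idx = response.rfind('<|im_start|>assistant\n') + len('<|im_start|>assistant\n') + 1
        end_idx = response.rfind('<|im_end|>')
        return response[idx:end_idx]
    return response
```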
26 changes: 4 additions & 22 deletions backend/llavanext.py
@@ -14,15 +14,10 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None)
super().__init__(model_id, device, extra_params, format)

if not format:
if 'mistral' in model_id:
self.format = 'llama2'
elif 'vicuna' in model_id:
self.format = 'vicuna'
elif 'v1.6-34b' in model_id:
self.format = 'chatml'
self.format = guess_model_format(model_id)

self.processor = LlavaNextProcessor.from_pretrained(model_id)
self.model = LlavaNextForConditionalGeneration.from_pretrained(**self.params)
self.model = LlavaNextForConditionalGeneration.from_pretrained(**self.params).eval()

print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

@@ -32,19 +27,6 @@ async def chat_with_images(self, messages: list[Message], max_tokens: int) -> st
inputs = self.processor(prompt, images, return_tensors="pt").to(self.model.device)

output = self.model.generate(**inputs, max_new_tokens=max_tokens)
answer = self.processor.decode(output[0], skip_special_tokens=True)
response = self.processor.decode(output[0], skip_special_tokens=True)

if self.format in ['llama2', 'mistral']:
idx = answer.rfind('[/INST]') + len('[/INST]') + 1 #+ len(images)
return answer[idx:]
elif self.format == 'vicuna':
idx = answer.rfind('ASSISTANT:') + len('ASSISTANT:') + 1 #+ len(images)
return answer[idx:]
elif self.format == 'chatml':
# XXX This is broken with the 34b, extra spaces in the tokenizer
# XXX You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
idx = answer.rfind('<|im_start|>assistant\n') + len('<|im_start|>assistant\n') + 1 #+ len(images)
end_idx = answer.rfind('<|im_end|>')
return answer[idx:end_idx]

return answer
return answer_from_response(response, self.format)
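
The per-model format guessing removed here and in llava.py is now centralized as `guess_model_format` in `vision_qna`. A sketch reconstructed from those removed branches; the real helper likely covers more formats (the README also lists phi15, gemma, etc.) and may use a different fallback:

```python
# Sketch only: reconstructed from the guessing logic this commit removes from
# llava.py and llavanext.py; the real guess_model_format in vision_qna may differ.
def guess_model_format(model_id: str) -> str:
    model_id = model_id.lower()
    if 'mistral' in model_id or 'bakllava' in model_id:
        return 'llama2'
    if 'vicuna' in model_id:
        return 'vicuna'
    if 'v1.6-34b' in model_id:
        return 'chatml'
    return 'vicuna'  # assumed fallback; the actual default is not shown in this diff
```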
55 changes: 55 additions & 0 deletions backend/minigemini.py
@@ -0,0 +1,55 @@
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE: assumes Mini-Gemini follows the LLaVA-style module layout for these helpers.
from minigemini.model.builder import load_pretrained_model
from minigemini.mm_utils import process_images, tokenizer_image_token
from minigemini.constants import IMAGE_TOKEN_INDEX

from vision_qna import *

class VisionQnA(VisionQnABase):
model_name: str = "minigemini"
format: str = "llama2"

def __init__(self, model_id: str, device: str, extra_params = {}, format = None):
super().__init__(model_id, device, extra_params, format)

if not format:
self.format = guess_model_format(model_id)

        # NOTE: load_pretrained_model is assumed to follow the LLaVA-style signature:
        # (model_path, model_base, model_name, load_8bit, load_4bit, device=...)
        self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
            model_id, None, model_id.split('/')[-1],
            extra_params.get('load_in_8bit', False), extra_params.get('load_in_4bit', False),
            device=self.device)


print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str:
images, prompt = await prompt_from_messages(messages, self.format)

        # Preprocess the PIL images for the vision tower (process_images is assumed
        # to take the images, the image processor, and the model config, LLaVA-style).
        image_tensor = process_images(images, self.image_processor, self.model.config)
        image_tensor = image_tensor.to(self.model.device, dtype=self.model.dtype)

input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.model.device)

with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor,
images_aux=None,
do_sample=False,
temperature=0.0,
max_new_tokens=max_tokens,
bos_token_id=self.tokenizer.bos_token_id, # Begin of sequence token
eos_token_id=self.tokenizer.eos_token_id, # End of sequence token
pad_token_id=self.tokenizer.pad_token_id, # Pad token
use_cache=True)

        answer = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

        return answer



70 changes: 70 additions & 0 deletions backend/monkey.py
@@ -0,0 +1,70 @@
import os
import uuid
from transformers import AutoTokenizer, AutoModelForCausalLM

from vision_qna import *

# echo840/Monkey

class VisionQnA(VisionQnABase):
model_name: str = "monkey"
format: str = 'qwen' # phi15-ish

def __init__(self, model_id: str, device: str, extra_params = {}, format = None):
super().__init__(model_id, device, extra_params, format)

# XXX currently bugged https://huggingface.co/echo840/Monkey/discussions/4
self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=self.params.get('trust_remote_code', False))
self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

self.tokenizer.padding_side = 'left'
self.tokenizer.pad_token_id = self.tokenizer.eod_id

print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str:
files = []
prompt = ''

for m in messages:
if m.role == 'user':
p = ''
for c in m.content:
if c.type == 'image_url':
                        filename = await url_to_file(c.image_url.url)
                        files.append(filename)  # track the temp file so it can be removed after generation
                        p = '<img>' + filename + '</img> ' + p
if c.type == 'text':
p += f"{c.text}\n\n" # Question:
prompt += p
elif m.role == 'assistant':
for c in m.content:
if c.type == 'text':
prompt += f"Answer: {c.text}\n\n"

prompt += "Answer:"

input_ids = self.tokenizer(prompt, return_tensors='pt', padding='longest')

attention_mask = input_ids.attention_mask.to(self.model.device)
input_ids = input_ids.input_ids.to(self.model.device)

pred = self.model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
do_sample=False,
num_beams=1,
            max_new_tokens=max_tokens,  # honor the caller's requested limit instead of a hard-coded 512
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=self.tokenizer.eod_id,
eos_token_id=self.tokenizer.eod_id,
)
response = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()

for f in files:
os.remove(f)

return response
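
The Monkey backend stages each image on disk via `url_to_file` from `vision_qna` so it can be referenced with Qwen-style `<img>path</img>` tags, then deletes the temp files after generation. A plausible sketch of that helper, assuming `aiohttp` is available (the real implementation is not shown in this diff and may also need to handle base64 `data:` URLs):

```python
# Sketch only: a plausible url_to_file, inferred from how monkey.py uses it
# (it must return a local path the caller deletes later). Assumes aiohttp.
import uuid
import aiohttp

async def url_to_file(url: str) -> str:
    filename = f"/tmp/{uuid.uuid4().hex}.img"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(await resp.read())
    return filename
```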
6 changes: 3 additions & 3 deletions backend/moondream1.py
@@ -14,17 +14,17 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None)
del self.params['device_map']

self.tokenizer = CodeGenTokenizerFast.from_pretrained(model_id)
self.model = AutoModelForCausalLM.from_pretrained(**self.params, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

# bitsandbytes already moves the model to the device, so we don't need to do it again.
if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
self.model.to(self.device)
self.model = self.model.to(self.device)

print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str:
images, prompt = await prompt_from_messages(messages, self.format)
encoded_images = self.model.encode_image(images[0]).to(self.device)
encoded_images = self.model.encode_image(images[0]).to(self.model.device)

# XXX currently broken here...
"""
8 changes: 5 additions & 3 deletions backend/moondream2.py
@@ -3,6 +3,8 @@

from vision_qna import *

# vikhyatk/moondream2

class VisionQnA(VisionQnABase):
model_name: str = "moondream2"
revision: str = '2024-03-13' # 'main'
@@ -15,16 +17,16 @@ def __init__(self, model_id: str, device: str, extra_params = {}, format = None)
del self.params['device_map']

self.tokenizer = AutoTokenizer.from_pretrained(model_id)
self.model = AutoModelForCausalLM.from_pretrained(**self.params, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(**self.params).eval()

# # bitsandbytes already moves the model to the device, so we don't need to do it again.
if not (extra_params.get('load_in_4bit', False) or extra_params.get('load_in_8bit', False)):
self.model.to(self.device)
self.model = self.model.to(self.device)

print(f"Loaded on device: {self.model.device} with dtype: {self.model.dtype}")

async def chat_with_images(self, messages: list[Message], max_tokens: int) -> str:
images, prompt = await prompt_from_messages(messages, self.format)
images, prompt = await phi15_prompt_from_messages(messages)

encoded_images = self.model.encode_image(images).to(self.device)
