Commit: wip minicpmv

ngxson committed Jan 19, 2025
1 parent d0068ef · commit 4a7ab89
Showing 9 changed files with 491 additions and 77 deletions.
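
Among other changes, this commit teaches convert_hf_to_gguf.py to emit CLIP/vision metadata and MiniCPM-V resampler tensors. A rough sketch of inspecting such metadata in a converted file, assuming gguf-py's GGUFReader is available and that the new add_vision_clip_* keys share a "vision." prefix (the exact key strings are defined in gguf-py and are not shown in this diff; the output file name is hypothetical):

# Hedged inspection sketch: list vision-related metadata in a converted GGUF.
# Assumes gguf-py is installed; the "vision." prefix and the file name below
# are assumptions for illustration, not taken from this diff.
from gguf import GGUFReader

reader = GGUFReader("minicpmv-2.6-f16.gguf")
for name in reader.fields:
    if name.startswith("vision."):
        print(name)
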
160 changes: 113 additions & 47 deletions convert_hf_to_gguf.py
@@ -17,7 +17,7 @@
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
from itertools import chain

from transformers import AutoConfig, AutoImageProcessor
from transformers import AutoConfig
import math
import numpy as np
import torch
@@ -134,6 +134,16 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any:
return None
raise KeyError(f"could not find any of: {keys}")

def find_vparams(self, keys: Iterable[str], optional: bool = False) -> Any:
if self.vparams is None:
raise ValueError("vision model parameters not set")
key = next((k for k in keys if k in self.vparams), None)
if key is not None:
return self.vparams[key]
if optional:
return None
raise KeyError(f"(vision) could not find any of: {keys}")

def set_vocab(self):
self._set_vocab_gpt2()

@@ -269,6 +279,20 @@ def set_gguf_parameters(self):
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)

# Vision model parameters
if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None:
self.gguf_writer.add_vision_type("clip-vit")
self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
self.gguf_writer.add_vision_clip_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch])
self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"])
self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
self.gguf_writer.add_vision_clip_select_layer(self.find_hparam(["vision_feature_layer", "mm_vision_select_layer"]))

self.gguf_writer.add_file_type(self.ftype)
logger.info(f"gguf: file type = {self.ftype}")

@@ -488,17 +512,14 @@ def load_hparams(dir_model: Path):
return hparams

@staticmethod
def load_preprocessor_config(dir_or_model_id: Path | str):
def load_preprocessor_config(dir_model: Path):
# TODO: this varies vastly among models, need to handle more cases in the future
if isinstance(dir_or_model_id, Path):
file_path = dir_or_model_id / "preprocessor_config.json"
if os.path.exists(file_path):
with open(file_path, "r", encoding="utf-8") as f:
return json.load(f)
else:
raise Exception(f"Preprocessor config not found at {file_path}")
file_path = dir_model / "preprocessor_config.json"
if os.path.exists(file_path):
with open(file_path, "r", encoding="utf-8") as f:
return json.load(f)
else:
return AutoImageProcessor.from_pretrained(dir_or_model_id).to_dict()
raise Exception(f"Preprocessor config not found at {file_path}")

@classmethod
def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -551,7 +572,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
toktypes: list[int] = []

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
# DEBIAN_FRONTEND=noninteractive means that the script is running in a non-interactive environment (i.e. CI), so we cannot answer Y/N when it asks for user input
is_cli_non_interactive = os.environ.get("DEBIAN_FRONTEND", "") == "noninteractive"
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=is_cli_non_interactive)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

@@ -1607,9 +1630,10 @@ def __init__(self, *args, **kwargs):

# only tested with https://huggingface.co/mtgv/MobileVLM_V2-1.7B
if "mm_vision_tower" in self.hparams and model_type == "mobilevlm":
from transformers import AutoImageProcessor
vision_model_id = self.hparams["mm_vision_tower"]
self.vparams = AutoConfig.from_pretrained(vision_model_id).to_dict()["vision_config"]
self.preprocessor_config = self.load_preprocessor_config(vision_model_id)
self.preprocessor_config = AutoImageProcessor.from_pretrained(vision_model_id).to_dict()
self.vision_arch = gguf.MODEL_ARCH.VISION_MOBILEVLM

if self.vparams is not None and self.vision_arch is not None:
@@ -1648,34 +1672,6 @@ def set_vocab(self):
if self.hparams.get("vocab_size", 32000) == 49152:
self.gguf_writer.add_add_bos_token(False)

# For vision model
if self.vparams is not None and self.preprocessor_config is not None and self.vision_arch is not None:
self.gguf_writer.add_vision_type("clip-vit")
self.gguf_writer.add_vision_image_size(self.vparams["image_size"])
self.gguf_writer.add_vision_patch_size(self.vparams["patch_size"])
self.gguf_writer.add_vision_clip_architecture(gguf.MODEL_ARCH_NAMES[self.vision_arch])
self.gguf_writer.add_vision_clip_block_count(self.vparams["num_hidden_layers"])
self.gguf_writer.add_vision_clip_embedding_length(self.vparams["hidden_size"])
self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
self.gguf_writer.add_vision_clip_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_clip_image_std(self.preprocessor_config["image_std"])
self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)
if "vision_feature_layer" in self.hparams:
self.gguf_writer.add_vision_clip_select_layer(self.hparams["vision_feature_layer"])
elif "mm_vision_select_layer" in self.hparams:
self.gguf_writer.add_vision_clip_select_layer(self.hparams["mm_vision_select_layer"])
else:
raise ValueError("gguf: can not find vision_feature_layer parameter.")
# TODO: should not hardcode these, but they are currently missing from config.json
if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)

def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
@@ -1692,6 +1688,18 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])

# For vision model
if self.vparams is not None:
self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
# TODO: should not hardcode these, but they are currently missing from config.json
if self.vision_arch == gguf.MODEL_ARCH.VISION_LLAVA:
self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
if self.vision_arch == gguf.MODEL_ARCH.VISION_MOBILEVLM:
self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.LDPV2)
self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2 + 1
self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)

@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
@@ -2132,23 +2140,66 @@ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims:
@Model.register("MiniCPMForCausalLM", "MiniCPMV")
class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM
proj_type: gguf.constants.CLIPProjectorType | None

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

model_type = self.hparams.get("model_type", None)

# only tested with https://huggingface.co/openbmb/MiniCPM-V-2_6
if "vision_config" in self.hparams and model_type == "minicpmv":
self.vparams = self.hparams["vision_config"]
self.preprocessor_config = self.load_preprocessor_config(self.dir_model)
self.vision_arch = gguf.MODEL_ARCH.VISION_MINICPMV
version = str(self.hparams.get("version", "unknown"))
if version == "2.5":
self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_5
elif version == "2.6":
self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_6
else:
raise ValueError(f"Unsupported MiniCPM-V version: {version}")

if self.vparams is not None and self.vision_arch is not None and self.preprocessor_config is not None:
self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5]
self.preprocessor_config["image_std"] = [0.5, 0.5, 0.5]
self.hparams["vision_feature_layer"] = 0
self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])

def set_gguf_parameters(self):
super().set_gguf_parameters()
embedding_scale = float(self.hparams["scale_emb"])
# scale_emb
embedding_scale = float(self.hparams.get("scale_emb", 1.0))
self.gguf_writer.add_embedding_scale(embedding_scale)
logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}")
residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
# scale_depth
if "scale_depth" in self.hparams:
residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5
else:
residual_scale = 1.0
self.gguf_writer.add_residual_scale(residual_scale)
logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}")
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
# logit_scale
if "dim_model_base" in self.hparams:
logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
else:
logit_scale = 1.0
self.gguf_writer.add_logit_scale(logit_scale)
logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
if self.hparams.get("rope_scaling") is not None:
if self.hparams["rope_scaling"].get("type") == "longrope":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")

# For vision model
if self.vparams is not None and self.proj_type is not None:
self.gguf_writer.add_vision_clip_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
self.gguf_writer.add_vision_clip_projector_type(self.proj_type)
self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-06)
max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
self.gguf_writer.add_vision_clip_max_position_embeddings(max_pos_embd)


def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]

@@ -2167,18 +2218,33 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))

def set_vocab(self):
self._set_vocab_sentencepiece()
if self.vision_arch == gguf.MODEL_ARCH.VISION_MINICPMV:
# undocumented anywhere, I only found this thanks to https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf
self._set_vocab_gpt2()
else:
self._set_vocab_sentencepiece()

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused

# For vision model
if name.startswith("llm."):
name = name.replace("llm.", "")
# attention, someone messed up and used an underscore instead of a dot
if name.endswith("in_proj_weight"):
name = name.replace("_weight", ".weight")
if name.endswith("in_proj_bias"):
name = name.replace("_bias", ".bias")
if "post_layernorm" in name:
return [] # skip post_layernorm

n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")

# HF models permute some of the tensors, so we need to undo that
if name.endswith(("q_proj.weight")):
if not name.startswith("vpm") and name.endswith(("q_proj.weight")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight")):
if not name.startswith("vpm") and name.endswith(("k_proj.weight")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)

return [(self.map_tensor_name(name), data_torch)]
@@ -5064,7 +5130,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):

def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert a huggingface model to a GGML compatible file")
description="Convert a huggingface model to a GGML compatible file\n\nNote: When converting vision models, this script may use internet connection to download configuration files via Hugging Face.")
parser.add_argument(
"--vocab-only", action="store_true",
help="extract only the vocab",
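The tensor-name handling added to MiniCPMModel.modify_tensors above can be summarized as a small standalone sketch; the sample names are illustrative assumptions, and the q/k permutation and map_tensor_name steps are omitted:

# Standalone sketch of the MiniCPM-V renaming rules from modify_tensors above.
# Sample names are assumptions for illustration, not dumped from a checkpoint.

def normalize_minicpmv_name(name: str) -> str | None:
    if name.startswith("llm."):
        # language-model tensors live under an "llm." prefix in the HF checkpoint
        name = name.replace("llm.", "")
    if name.endswith("in_proj_weight"):
        # the vision tower stores "in_proj_weight" where the tensor map expects "in_proj.weight"
        name = name.replace("_weight", ".weight")
    if name.endswith("in_proj_bias"):
        name = name.replace("_bias", ".bias")
    if "post_layernorm" in name:
        return None  # skipped entirely, as in the diff
    return name

print(normalize_minicpmv_name("llm.model.layers.0.self_attn.q_proj.weight"))
print(normalize_minicpmv_name("vpm.encoder.layers.0.self_attn.in_proj_weight"))
print(normalize_minicpmv_name("vpm.post_layernorm.weight"))
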
46 changes: 44 additions & 2 deletions gguf-py/gguf/constants.py
@@ -310,6 +310,7 @@ class MODEL_ARCH(IntEnum):
# vision models
VISION_LLAVA = auto()
VISION_MOBILEVLM = auto()
VISION_MINICPMV = auto()


class MODEL_TENSOR(IntEnum):
@@ -455,6 +456,15 @@ class MODEL_TENSOR(IntEnum):
V_ENC_FFN_DOWN = auto()
V_PRE_NORM = auto()
V_POST_NORM = auto()
V_RESMPL_POS_EMBD_K = auto() # minicpmv
V_RESMPL_ATTN_IN = auto() # minicpmv
V_RESMPL_ATTN_OUT = auto() # minicpmv
V_RESMPL_KV_PROJ = auto() # minicpmv
V_RESMPL_NORM_POST = auto() # minicpmv
V_RESMPL_NORM_KV = auto() # minicpmv
V_RESMPL_NORM_Q = auto() # minicpmv
V_RESMPL_PROJ = auto() # minicpmv
V_RESMPL_QUERY = auto() # minicpmv


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -518,6 +528,7 @@ class MODEL_TENSOR(IntEnum):
# vision
MODEL_ARCH.VISION_LLAVA: "llava",
MODEL_ARCH.VISION_MOBILEVLM: "mobilevlm",
MODEL_ARCH.VISION_MINICPMV: "minicpmv",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -662,6 +673,15 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.enc.blk.{bid}.ffn_down",
MODEL_TENSOR.V_PRE_NORM: "v.pre_norm",
MODEL_TENSOR.V_POST_NORM: "v.post_norm",
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "v.resmpl.pos_embd_k",
MODEL_TENSOR.V_RESMPL_ATTN_IN: "v.resmpl.attn_in",
MODEL_TENSOR.V_RESMPL_ATTN_OUT: "v.resmpl.attn_out",
MODEL_TENSOR.V_RESMPL_KV_PROJ: "v.resmpl.kv_proj",
MODEL_TENSOR.V_RESMPL_NORM_POST: "v.resmpl.norm_post",
MODEL_TENSOR.V_RESMPL_NORM_KV: "v.resmpl.norm_kv",
MODEL_TENSOR.V_RESMPL_NORM_Q: "v.resmpl.norm_q",
MODEL_TENSOR.V_RESMPL_PROJ: "v.resmpl.proj",
MODEL_TENSOR.V_RESMPL_QUERY: "v.resmpl.query",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1636,6 +1656,26 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_PRE_NORM,
MODEL_TENSOR.V_POST_NORM,
],
MODEL_ARCH.VISION_MINICPMV: [
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_ATTN_Q,
MODEL_TENSOR.V_ENC_ATTN_K,
MODEL_TENSOR.V_ENC_ATTN_V,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_OUTPUT,
MODEL_TENSOR.V_ENC_OUTPUT_NORM,
MODEL_TENSOR.V_ENC_FFN_UP,
MODEL_TENSOR.V_ENC_FFN_DOWN,
MODEL_TENSOR.V_RESMPL_ATTN_IN,
MODEL_TENSOR.V_RESMPL_ATTN_OUT,
MODEL_TENSOR.V_RESMPL_KV_PROJ,
MODEL_TENSOR.V_RESMPL_NORM_POST,
MODEL_TENSOR.V_RESMPL_NORM_KV,
MODEL_TENSOR.V_RESMPL_NORM_Q,
MODEL_TENSOR.V_RESMPL_PROJ,
MODEL_TENSOR.V_RESMPL_QUERY,
],
# TODO
}

@@ -1718,8 +1758,10 @@ class PoolingType(IntEnum):


class CLIPProjectorType(Enum):
MLP = 'mlp'
LDPV2 = 'ldpv2'
MLP = 'mlp'
LDPV2 = 'ldpv2'
MINICPMV_2_5 = 'minicpmv-2.5' # resampler
MINICPMV_2_6 = 'minicpmv-2.6' # resampler


class CLIPPatchMergeType(Enum):
(The remaining 7 changed files in this commit are not rendered here.)
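
A quick way to exercise the new constants from Python, assuming this branch's gguf-py package is importable (for example after pip install -e gguf-py); the printed strings come straight from the additions above:

# Sketch: resolve the new MiniCPM-V names added to gguf-py above.
# Assumes this branch's gguf-py is on the Python path.
import gguf

print(gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.VISION_MINICPMV])   # "minicpmv"
print(gguf.constants.CLIPProjectorType.MINICPMV_2_6.value)      # "minicpmv-2.6"
for t in (gguf.MODEL_TENSOR.V_RESMPL_KV_PROJ,
          gguf.MODEL_TENSOR.V_RESMPL_QUERY,
          gguf.MODEL_TENSOR.V_RESMPL_PROJ):
    print(gguf.TENSOR_NAMES[t])                                  # "v.resmpl.kv_proj", ...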
