From 5688e58ca2797a34bd56e75c045d41be6aca1e2b Mon Sep 17 00:00:00 2001 From: zengwangding Date: Tue, 7 May 2024 16:23:16 +0800 Subject: [PATCH 01/17] support deepseek_v2 --- vllm/config.py | 7 + .../layers/fused_moe/fused_moe.py | 57 ++- .../model_executor/layers/rotary_embedding.py | 125 +++++ vllm/model_executor/models/__init__.py | 3 + vllm/model_executor/models/deepseek_v2.py | 484 ++++++++++++++++++ 5 files changed, 654 insertions(+), 22 deletions(-) create mode 100644 vllm/model_executor/models/deepseek_v2.py diff --git a/vllm/config.py b/vllm/config.py index 5c3a8615eefb4..19847192d8afa 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -250,6 +250,9 @@ def get_hidden_size(self) -> int: return self.hf_text_config.hidden_size def get_head_size(self) -> int: + if hasattr(self.hf_text_config, "model_type") and self.hf_text_config.model_type=='deepseek_v2': + # FlashAttention suports only 32, 64, 128, 256, we need pading 192 to 256 + return 256 if hasattr(self.hf_text_config, "head_dim"): return self.hf_text_config.head_dim # FIXME(woosuk): This may not be true for all models. @@ -262,6 +265,8 @@ def get_total_num_kv_heads(self) -> int: # NOTE: for falcon, when new_decoder_architecture is True, the # multi_query flag is ignored and we use n_head_kv for the number of # KV heads. + if hasattr(self.hf_text_config, "model_type") and self.hf_text_config.model_type=='deepseek_v2': + return self.hf_text_config.num_attention_heads falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] new_decoder_arch_falcon = ( self.hf_config.model_type in falcon_model_types @@ -307,6 +312,8 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: def get_num_attention_heads(self, parallel_config: "ParallelConfig") -> int: + if hasattr(self.hf_text_config, "model_type") and self.hf_text_config.model_type=='deepseek_v2': + return self.hf_text_config.num_attention_heads return self.hf_text_config.num_attention_heads // \ parallel_config.tensor_parallel_size diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3cb0419404625..ac1dc60b4650d 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -322,6 +322,8 @@ def fused_moe( w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, + num_expert_group: int = 0, + topk_group: int = 0, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -362,7 +364,6 @@ def fused_moe( ] M, _ = hidden_states.shape E, N, _ = w1.shape - if is_hip(): # The MoE kernels are not yet supported on ROCm. routing_weights = torch.softmax(gating_output, @@ -370,27 +371,39 @@ def fused_moe( dtype=torch.float32) topk_weights, topk_ids = torch.topk(routing_weights, topk, dim=-1) else: - import vllm._moe_C as moe_kernels - - topk_weights = torch.empty(M, - topk, - dtype=torch.float32, - device=hidden_states.device) - topk_ids = torch.empty(M, - topk, - dtype=torch.int32, - device=hidden_states.device) - token_expert_indicies = torch.empty(M, - topk, - dtype=torch.int32, - device=hidden_states.device) - moe_kernels.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) - del token_expert_indicies # Not used. Will be used in the future. 
+ if num_expert_group == 0: + import vllm._moe_C as moe_kernels + + topk_weights = torch.empty(M, + topk, + dtype=torch.float32, + device=hidden_states.device) + topk_ids = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) + token_expert_indicies = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) + moe_kernels.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) + del token_expert_indicies # Not used. Will be used in the future. + else: + scores = torch.softmax(gating_output, dim = -1) + num_token = scores.shape[0] + group_scores = scores.view(num_token, num_expert_group, -1).max(dim=-1).values # [n, n_group] + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = group_mask.unsqueeze(-1).expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group).reshape(num_token, -1) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] + topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False) + if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 857d70fadcb57..fd671c1f47a75 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -452,6 +452,116 @@ def forward( return query.flatten(-2), key.flatten(-2) +def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +class DeepseekScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with YaRN method. + + Credits to Peng et al. github.com/jquesnelle/yarn + """ + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + scaling_factor: float, + *, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: float = 32, + beta_slow: float = 1, + mscale: float = 1, + mscale_all_dim: float = 0, + ) -> None: + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + # Get n-d magnitude scaling corrected for interpolation. 
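The grouped routing added to fused_moe above can be exercised on its own. Below is a minimal, self-contained sketch of the same group-then-top-k selection, using hypothetical toy sizes (8 experts split into 4 groups, keep the top-2 groups, then pick the top-2 experts); it illustrates the selection logic and is not code from this patch:

import torch

# Hypothetical toy sizes: 3 tokens, 8 experts in 4 groups, top-2 groups, top-2 experts.
num_token, num_experts, num_expert_group, topk_group, topk = 3, 8, 4, 2, 2
gating_output = torch.randn(num_token, num_experts)

scores = torch.softmax(gating_output, dim=-1)
# Score each group by its best expert, then keep only the top-k groups.
group_scores = scores.view(num_token, num_expert_group, -1).max(dim=-1).values
group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1]
group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)
# Zero out every expert that lives in a discarded group, then do a plain top-k.
score_mask = group_mask.unsqueeze(-1).expand(
    num_token, num_expert_group,
    num_experts // num_expert_group).reshape(num_token, -1)
masked_scores = scores.masked_fill(~score_mask.bool(), 0.0)
topk_weights, topk_ids = torch.topk(masked_scores, k=topk, dim=-1, sorted=False)
print(topk_ids)      # expert ids, always drawn from the 2 surviving groups per token
print(topk_weights)  # the corresponding softmax scores (renormalized later if requested)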
+ self.mscale = float( + yarn_get_mscale(self.scaling_factor, float(mscale)) + / yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) * attn_factor) + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style) + + def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: + pos_freqs = self.base**(torch.arange( + 0, self.rotary_dim, 2, dtype=torch.float, device="cuda") / + self.rotary_dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) + + low, high = _yarn_find_correction_range(self.beta_fast, self.beta_slow, + self.rotary_dim, self.base, + self.max_position_embeddings) + # Get n-d rotational scaling corrected for extrapolation + inv_freq_mask = (1 - _yarn_linear_ramp_mask( + low, high, self.rotary_dim // 2, dtype=torch.float)) * self.extrapolation_factor + inv_freq = inv_freq_interpolation * ( + 1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + inv_freq = self._compute_inv_freq(self.scaling_factor) + t = torch.arange(self.max_position_embeddings * self.scaling_factor, + device="cuda", + dtype=torch.float32) + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = (freqs.cos() * self.mscale) + sin = (freqs.sin() * self.mscale) + cache = torch.cat((cos, sin), dim=-1) + print("Cache shape", cache.shape) + return cache + + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """PyTorch-native implementation equivalent to forward().""" + query_rot = query[..., :self.rotary_dim] + key_rot = key[..., :self.rotary_dim] + if self.rotary_dim < self.head_size: + query_pass = query[..., self.rotary_dim:] + key_pass = key[..., self.rotary_dim:] + + self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to( + positions.device) + cos_sin = self.cos_sin_cache[torch.add(positions, offsets) + if offsets is not None else positions] + cos, sin = cos_sin.chunk(2, dim=-1) + if self.is_neox_style: + # NOTE(woosuk): Here we assume that the positions tensor has the + # shape [batch_size, seq_len]. 
+ cos = cos.repeat(1, 1, 2).unsqueeze(-2) + sin = sin.repeat(1, 1, 2).unsqueeze(-2) + else: + cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) + sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) + + rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj + query_rot = query_rot * cos + rotate_fn(query_rot) * sin + key_rot = key_rot * cos + rotate_fn(key_rot) * sin + + if self.rotary_dim < self.head_size: + query = torch.cat((query_rot, query_pass), dim=-1) + key = torch.cat((key_rot, key_pass), dim=-1) + else: + query = query_rot + key = key_rot + return query, key + _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} @@ -506,6 +616,21 @@ def get_rope( base, is_neox_style, scaling_factor, **extra_kwargs) + elif scaling_type == "deepseek_yarn": + original_max_position = rope_scaling[ + "original_max_position_embeddings"] + # assert max_position == original_max_position * scaling_factor + extra_kwargs = { + k: v + for k, v in rope_scaling.items() + if k in ("extrapolation_factor", "attn_factor", "beta_fast", + "beta_slow", "mscale", "mscale_all_dim") + } + rotary_emb = DeepseekScalingRotaryEmbedding(head_size, rotary_dim, + original_max_position, + base, is_neox_style, + scaling_factor, + **extra_kwargs) elif scaling_type == "su": short_factor = rope_scaling["short_factor"] long_factor = rope_scaling["long_factor"] diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index c5cdc059473b3..c5c42f1515ed8 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -20,7 +20,10 @@ "CohereForCausalLM": ("commandr", "CohereForCausalLM"), "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), + "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), + "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"), + "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py new file mode 100644 index 0000000000000..e59e0bcf2f2ca --- /dev/null +++ b/vllm/model_executor/models/deepseek_v2.py @@ -0,0 +1,484 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
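For context on the deepseek_yarn branch registered in get_rope above, a rope_scaling dict of roughly the following shape would select it. The numeric values below are illustrative placeholders, not values taken from a released checkpoint:

# Illustrative only: the keys are the ones the deepseek_yarn branch reads;
# the numbers are placeholders rather than a real model config.
from vllm.model_executor.layers.rotary_embedding import get_rope

rope_scaling = {
    "type": "deepseek_yarn",
    "factor": 40,                              # scaling_factor
    "original_max_position_embeddings": 4096,  # pre-extension context length
    "beta_fast": 32,
    "beta_slow": 1,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
}
rotary_emb = get_rope(
    64,                      # head_size: only the rope part of the head (qk_rope_head_dim)
    rotary_dim=64,
    max_position=4096 * 40,  # original_max_position * factor
    base=10000,
    is_neox_style=False,
    rope_scaling=rope_scaling,
)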
+"""Inference-only DeepseekV2 model.""" +from typing import Any, Dict, Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention import Attention, AttentionMetadata +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear, + ColumnParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import SamplerOutput + + +class DeepseekV2MLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class DeepseekV2MoE(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + self.n_routed_experts = config.n_routed_experts + self.top_k = config.num_experts_per_tok + self.routed_scaling_factor = config.routed_scaling_factor + if self.tp_size > self.n_routed_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {self.n_routed_experts}.") + + self.experts = nn.ModuleList([ + DeepseekV2MLP(hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False) + for idx in range(self.n_routed_experts) + ]) + self.pack_params() + + self.gate = ReplicatedLinear(config.hidden_size, + self.n_routed_experts, + bias=False, + quant_config=None) + + if config.n_shared_experts is not None: + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) + self.shared_experts = DeepseekV2MLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + ) + + def pack_params(self): + w1 = [] + w2 = [] + for expert in self.experts: + w1.append(expert.gate_up_proj.weight) + w2.append(expert.down_proj.weight) + self.w1 = torch._utils._flatten_dense_tensors(w1) + w1s = torch._utils._unflatten_dense_tensors(self.w1, w1) + for data, param in zip(w1s, w1): + param.data = data + self.w1 = self.w1.view(len(w1), *w1s[0].shape) + + self.w2 = torch._utils._flatten_dense_tensors(w2) + w2s = torch._utils._unflatten_dense_tensors(self.w2, w2) + for data, param in zip(w2s, w2): + param.data = data + + self.w2 = self.w2.view(len(w2), *w2s[0].shape) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + if self.config.n_shared_experts is not None: + shared_output = self.shared_experts(hidden_states) + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = fused_moe(hidden_states, + self.w1, + self.w2, + router_logits, + self.top_k, + renormalize=self.config.norm_topk_prob, + inplace=True, + num_expert_group = self.config.n_group, + topk_group=self.config.topk_group) * self.routed_scaling_factor + if self.config.n_shared_experts is not None: + final_hidden_states = final_hidden_states + shared_output + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + return final_hidden_states.view(num_tokens, hidden_dim) + +def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: + import math + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + +class DeepseekV2Attention(nn.Module): + def __init__( + self, + config: PretrainedConfig, + hidden_size: int, num_heads: int, + qk_nope_head_dim: int, qk_rope_head_dim: int, v_head_dim: int, + q_lora_rank: int, kv_lora_rank: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: 
Optional[QuantizationConfig] = None, + layer_idx = None, + ) -> None: + super().__init__() + self.layer_idx = layer_idx + self.hidden_size = hidden_size + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim + self.v_head_dim = v_head_dim + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + self.num_heads = num_heads + tp_size = get_tensor_model_parallel_world_size() + assert num_heads % tp_size == 0 + self.num_local_heads = num_heads // tp_size + self.scaling = self.qk_head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.q_a_proj = ReplicatedLinear( + self.hidden_size, self.q_lora_rank, + bias=False, quant_config=quant_config + ) + self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps) + self.q_b_proj = ColumnParallelLinear( + q_lora_rank, self.num_heads * self.qk_head_dim, + bias=False, quant_config=quant_config + ) + self.kv_a_proj_with_mqa = ReplicatedLinear( + self.hidden_size, self.kv_lora_rank + self.qk_rope_head_dim, + bias=False, quant_config=quant_config + ) + self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps) + self.kv_b_proj = ColumnParallelLinear( + self.kv_lora_rank, self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), + bias=False, quant_config=quant_config + ) + # O projection. + self.o_proj = RowParallelLinear( + self.num_heads * self.v_head_dim, self.hidden_size, + bias=False, quant_config=quant_config + ) + rope_scaling['type'] = 'deepseek_yarn' + self.rotary_emb = get_rope( + qk_rope_head_dim, + rotary_dim=qk_rope_head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=False + ) + + if rope_scaling: + mscale_all_dim = rope_scaling.get("mscale_all_dim", False) + scaling_factor = rope_scaling["factor"] + mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) + self.scaling = self.scaling * mscale * mscale + + # self.attn = Attention(self.num_heads, + # self.qk_head_dim, + # self.scaling, + # num_kv_heads=self.num_heads) + + # TODO, support head_size 192 + self.attn = Attention(self.num_local_heads, + 256, + self.scaling, + num_kv_heads=self.num_local_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + + q = self.q_a_proj(hidden_states)[0] + q = self.q_a_layernorm(q) + q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim) + q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0] + kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + latent_cache = latent_cache.unsqueeze(1) + kv_a = self.kv_a_layernorm(kv_a.contiguous()) + kv = self.kv_b_proj(kv_a)[0] + kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim) + k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_pe = latent_cache[:, :, self.kv_lora_rank:] + q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) + q[..., self.qk_nope_head_dim:] = q_pe + k = torch.empty_like(q) + k[..., :self.qk_nope_head_dim] = k_nope + k[..., self.qk_nope_head_dim:] = k_pe + q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], value = 0).view(-1, self.num_local_heads * 256) + k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], value = 0).view(-1, 
self.num_local_heads * 256) + v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], value = 0).view(-1, self.num_local_heads * 256) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + attn_output = attn_output.view(-1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape(-1, self.num_local_heads * self.v_head_dim) + output, _ = self.o_proj(attn_output) + return output + + +class DeepseekV2DecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + layer_idx: int, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = DeepseekV2Attention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + qk_nope_head_dim=config.qk_nope_head_dim, + qk_rope_head_dim=config.qk_rope_head_dim, + v_head_dim=config.v_head_dim, + q_lora_rank=config.q_lora_rank, + kv_lora_rank=config.kv_lora_rank, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + layer_idx = layer_idx, + ) + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0): + self.mlp = DeepseekV2MoE(config=config, quant_config=quant_config) + else: + self.mlp = DeepseekV2MLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class DeepseekV2Model(nn.Module): + + fall_back_to_pt_during_load = False + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + DeepseekV2DecoderLayer(config, layer_idx, quant_config=quant_config) + for layer_idx in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = 
layer(positions, hidden_states, + kv_caches[i], attn_metadata, + residual) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class DeepseekV2ForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.quant_config = quant_config + self.model = DeepseekV2Model(config, quant_config) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head.weight, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if (("mlp.experts." in name or "mlp.shared_experts." in name) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip experts that are not assigned to this worker. + if (("mlp.experts." in name or "mlp.shared_experts." 
in name) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From 2bcfba804e368a65f6a75f5a5d9b448bfdc0e3c0 Mon Sep 17 00:00:00 2001 From: zengwangding Date: Thu, 16 May 2024 15:08:41 +0800 Subject: [PATCH 02/17] add support for q_proj without lora --- vllm/model_executor/models/deepseek_v2.py | 37 ++++++++++++++--------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index e59e0bcf2f2ca..705bc9b118d1d 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -205,15 +205,22 @@ def __init__( self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings - self.q_a_proj = ReplicatedLinear( - self.hidden_size, self.q_lora_rank, - bias=False, quant_config=quant_config - ) - self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps) - self.q_b_proj = ColumnParallelLinear( - q_lora_rank, self.num_heads * self.qk_head_dim, - bias=False, quant_config=quant_config - ) + if self.q_lora_rank is not None: + self.q_a_proj = ReplicatedLinear( + self.hidden_size, self.q_lora_rank, + bias=False, quant_config=quant_config + ) + self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps) + self.q_b_proj = ColumnParallelLinear( + q_lora_rank, self.num_heads * self.qk_head_dim, + bias=False, quant_config=quant_config + ) + else: + self.q_proj = ColumnParallelLinear( + self.hidden_size, self.num_heads * self.qk_head_dim, + bias=False, quant_config=quant_config + ) + self.kv_a_proj_with_mqa = ReplicatedLinear( self.hidden_size, self.kv_lora_rank + self.qk_rope_head_dim, bias=False, quant_config=quant_config @@ -262,10 +269,12 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: - - q = self.q_a_proj(hidden_states)[0] - q = self.q_a_layernorm(q) - q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim) + if self.q_lora_rank is not None: + q = self.q_a_proj(hidden_states)[0] + q = self.q_a_layernorm(q) + q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim) + else: + q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads, self.qk_head_dim) q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0] kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) @@ -310,7 +319,7 @@ def __init__( qk_nope_head_dim=config.qk_nope_head_dim, qk_rope_head_dim=config.qk_rope_head_dim, v_head_dim=config.v_head_dim, - q_lora_rank=config.q_lora_rank, + q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None, kv_lora_rank=config.kv_lora_rank, rope_theta=rope_theta, rope_scaling=rope_scaling, From 36425b01e78956b4f2001f7c64dd20da997c7aa3 Mon Sep 17 00:00:00 2001 From: zengwangding Date: Sat, 18 May 2024 16:18:01 +0800 Subject: [PATCH 03/17] fix up --- vllm/config.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 600c53ecd82c7..3e736c990abd7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -250,8 +250,9 @@ def get_hidden_size(self) -> int: return self.hf_text_config.hidden_size def get_head_size(self) -> int: + # TODO remove hard code if hasattr(self.hf_text_config, "model_type") and self.hf_text_config.model_type=='deepseek_v2': - # 
FlashAttention suports only 32, 64, 128, 256, we need pading 192 to 256 + # FlashAttention suports only head_size 32, 64, 128, 256, we need to pad head_size 192 to 256 return 256 if hasattr(self.hf_text_config, "head_dim"): return self.hf_text_config.head_dim @@ -265,8 +266,6 @@ def get_total_num_kv_heads(self) -> int: # NOTE: for falcon, when new_decoder_architecture is True, the # multi_query flag is ignored and we use n_head_kv for the number of # KV heads. - if hasattr(self.hf_text_config, "model_type") and self.hf_text_config.model_type=='deepseek_v2': - return self.hf_text_config.num_attention_heads falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] new_decoder_arch_falcon = ( self.hf_config.model_type in falcon_model_types @@ -312,8 +311,6 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: def get_num_attention_heads(self, parallel_config: "ParallelConfig") -> int: - if hasattr(self.hf_text_config, "model_type") and self.hf_text_config.model_type=='deepseek_v2': - return self.hf_text_config.num_attention_heads return self.hf_text_config.num_attention_heads // \ parallel_config.tensor_parallel_size From 28199d88a6b1a20c562bea4ee498874b009c67a5 Mon Sep 17 00:00:00 2001 From: zengwangding Date: Sat, 18 May 2024 16:28:36 +0800 Subject: [PATCH 04/17] fix up --- vllm/model_executor/layers/fused_moe/fused_moe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index eb513f36ce8cb..9075596adf221 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -490,6 +490,8 @@ def fused_moe( w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, + num_expert_group: int = 0, + topk_group: int = 0, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -521,7 +523,7 @@ def fused_moe( assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize) + renormalize, num_expert_group, topk_group) return fused_experts(hidden_states, w1, w2, From 434d757589d6512eb0c2d8337e12c65aa95794ec Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 21 May 2024 12:09:00 -0700 Subject: [PATCH 05/17] format --- vllm/config.py | 3 +- .../layers/fused_moe/fused_moe.py | 37 +++-- .../model_executor/layers/rotary_embedding.py | 20 +-- vllm/model_executor/models/__init__.py | 2 - vllm/model_executor/models/deepseek_v2.py | 156 +++++++++++------- 5 files changed, 128 insertions(+), 90 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 3e736c990abd7..0b8f0b0b72e1f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -251,7 +251,8 @@ def get_hidden_size(self) -> int: def get_head_size(self) -> int: # TODO remove hard code - if hasattr(self.hf_text_config, "model_type") and self.hf_text_config.model_type=='deepseek_v2': + if hasattr(self.hf_text_config, "model_type" + ) and self.hf_text_config.model_type == 'deepseek_v2': # FlashAttention suports only head_size 32, 64, 128, 256, we need to pad head_size 192 to 256 return 256 if hasattr(self.hf_text_config, "head_dim"): diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 9075596adf221..27a5df67ef18c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ 
b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -314,7 +314,7 @@ def fused_topk( topk: int, renormalize: bool, num_expert_group: int = 0, - topk_group: int = 0, + topk_group: int = 0, ): assert hidden_states.shape[0] == gating_output.shape[0], ( "Number of tokens mismatch") @@ -332,13 +332,13 @@ def fused_topk( import vllm._moe_C as moe_kernels topk_weights = torch.empty(M, - topk, - dtype=torch.float32, - device=hidden_states.device) + topk, + dtype=torch.float32, + device=hidden_states.device) topk_ids = torch.empty(M, - topk, - dtype=torch.int32, - device=hidden_states.device) + topk, + dtype=torch.int32, + device=hidden_states.device) token_expert_indicies = torch.empty(M, topk, dtype=torch.int32, @@ -351,15 +351,25 @@ def fused_topk( ) del token_expert_indicies # Not used. Will be used in the future. else: - scores = torch.softmax(gating_output, dim = -1) + scores = torch.softmax(gating_output, dim=-1) num_token = scores.shape[0] - group_scores = scores.view(num_token, num_expert_group, -1).max(dim=-1).values # [n, n_group] - group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1] # [n, top_k_group] + group_scores = scores.view(num_token, num_expert_group, + -1).max(dim=-1).values # [n, n_group] + group_idx = torch.topk(group_scores, + k=topk_group, + dim=-1, + sorted=False)[1] # [n, top_k_group] group_mask = torch.zeros_like(group_scores) # [n, n_group] group_mask.scatter_(1, group_idx, 1) # [n, n_group] - score_mask = group_mask.unsqueeze(-1).expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group).reshape(num_token, -1) # [n, e] + score_mask = group_mask.unsqueeze(-1).expand( + num_token, num_expert_group, + scores.shape[-1] // num_expert_group).reshape(num_token, + -1) # [n, e] tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] - topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False) + topk_weights, topk_ids = torch.topk(tmp_scores, + k=topk, + dim=-1, + sorted=False) if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) @@ -523,7 +533,8 @@ def fused_moe( assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize, num_expert_group, topk_group) + renormalize, num_expert_group, + topk_group) return fused_experts(hidden_states, w1, w2, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 0b2f90402d383..711658d649e33 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -488,8 +488,9 @@ def __init__( self.beta_slow = beta_slow # Get n-d magnitude scaling corrected for interpolation. 
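            # Worked example (hypothetical inputs): if mscale == mscale_all_dim, the two
            # yarn_get_mscale() terms cancel and self.mscale is just attn_factor; with
            # mscale=1.0, mscale_all_dim=0 and scaling_factor=40 it becomes
            # (0.1 * 1.0 * ln(40) + 1) * attn_factor, roughly 1.369 * attn_factor.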
self.mscale = float( - yarn_get_mscale(self.scaling_factor, float(mscale)) - / yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) * attn_factor) + yarn_get_mscale(self.scaling_factor, float(mscale)) / + yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) * + attn_factor) super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style) @@ -505,7 +506,8 @@ def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: self.max_position_embeddings) # Get n-d rotational scaling corrected for extrapolation inv_freq_mask = (1 - _yarn_linear_ramp_mask( - low, high, self.rotary_dim // 2, dtype=torch.float)) * self.extrapolation_factor + low, high, self.rotary_dim // 2, + dtype=torch.float)) * self.extrapolation_factor inv_freq = inv_freq_interpolation * ( 1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask return inv_freq @@ -522,7 +524,6 @@ def _compute_cos_sin_cache(self) -> torch.Tensor: print("Cache shape", cache.shape) return cache - def forward( self, positions: torch.Tensor, @@ -562,7 +563,8 @@ def forward( query = query_rot key = key_rot return query, key - + + _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} @@ -627,11 +629,9 @@ def get_rope( if k in ("extrapolation_factor", "attn_factor", "beta_fast", "beta_slow", "mscale", "mscale_all_dim") } - rotary_emb = DeepseekScalingRotaryEmbedding(head_size, rotary_dim, - original_max_position, - base, is_neox_style, - scaling_factor, - **extra_kwargs) + rotary_emb = DeepseekScalingRotaryEmbedding( + head_size, rotary_dim, original_max_position, base, + is_neox_style, scaling_factor, **extra_kwargs) elif scaling_type == "su": short_factor = rope_scaling["short_factor"] long_factor = rope_scaling["long_factor"] diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 9070c01a37a28..05cd7a135f14e 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -20,10 +20,8 @@ "CohereForCausalLM": ("commandr", "CohereForCausalLM"), "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), - "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"), - "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 705bc9b118d1d..e3bee8b478f78 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -103,10 +103,10 @@ def __init__( self.experts = nn.ModuleList([ DeepseekV2MLP(hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - reduce_results=False) + intermediate_size=config.moe_intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False) for idx in range(self.n_routed_experts) ]) self.pack_params() @@ -153,15 +153,16 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: shared_output = self.shared_experts(hidden_states) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = fused_moe(hidden_states, - self.w1, - self.w2, - router_logits, - self.top_k, - renormalize=self.config.norm_topk_prob, - inplace=True, - num_expert_group = self.config.n_group, - 
topk_group=self.config.topk_group) * self.routed_scaling_factor + final_hidden_states = fused_moe( + hidden_states, + self.w1, + self.w2, + router_logits, + self.top_k, + renormalize=self.config.norm_topk_prob, + inplace=True, + num_expert_group=self.config.n_group, + topk_group=self.config.topk_group) * self.routed_scaling_factor if self.config.n_shared_experts is not None: final_hidden_states = final_hidden_states + shared_output final_hidden_states = tensor_model_parallel_all_reduce( @@ -169,24 +170,31 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return final_hidden_states.view(num_tokens, hidden_dim) + def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: import math if scale <= 1: return 1.0 return 0.1 * mscale * math.log(scale) + 1.0 + class DeepseekV2Attention(nn.Module): + def __init__( self, config: PretrainedConfig, - hidden_size: int, num_heads: int, - qk_nope_head_dim: int, qk_rope_head_dim: int, v_head_dim: int, - q_lora_rank: int, kv_lora_rank: int, + hidden_size: int, + num_heads: int, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + v_head_dim: int, + q_lora_rank: int, + kv_lora_rank: int, rope_theta: float = 10000, rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, - layer_idx = None, + layer_idx=None, ) -> None: super().__init__() self.layer_idx = layer_idx @@ -206,44 +214,48 @@ def __init__( self.max_position_embeddings = max_position_embeddings if self.q_lora_rank is not None: - self.q_a_proj = ReplicatedLinear( - self.hidden_size, self.q_lora_rank, - bias=False, quant_config=quant_config - ) - self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps) - self.q_b_proj = ColumnParallelLinear( - q_lora_rank, self.num_heads * self.qk_head_dim, - bias=False, quant_config=quant_config - ) + self.q_a_proj = ReplicatedLinear(self.hidden_size, + self.q_lora_rank, + bias=False, + quant_config=quant_config) + self.q_a_layernorm = RMSNorm(self.q_lora_rank, + eps=config.rms_norm_eps) + self.q_b_proj = ColumnParallelLinear(q_lora_rank, + self.num_heads * + self.qk_head_dim, + bias=False, + quant_config=quant_config) else: - self.q_proj = ColumnParallelLinear( - self.hidden_size, self.num_heads * self.qk_head_dim, - bias=False, quant_config=quant_config - ) - - self.kv_a_proj_with_mqa = ReplicatedLinear( - self.hidden_size, self.kv_lora_rank + self.qk_rope_head_dim, - bias=False, quant_config=quant_config - ) - self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps) + self.q_proj = ColumnParallelLinear(self.hidden_size, + self.num_heads * + self.qk_head_dim, + bias=False, + quant_config=quant_config) + + self.kv_a_proj_with_mqa = ReplicatedLinear(self.hidden_size, + self.kv_lora_rank + + self.qk_rope_head_dim, + bias=False, + quant_config=quant_config) + self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, + eps=config.rms_norm_eps) self.kv_b_proj = ColumnParallelLinear( - self.kv_lora_rank, self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), - bias=False, quant_config=quant_config - ) + self.kv_lora_rank, + self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), + bias=False, + quant_config=quant_config) # O projection. 
- self.o_proj = RowParallelLinear( - self.num_heads * self.v_head_dim, self.hidden_size, - bias=False, quant_config=quant_config - ) + self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim, + self.hidden_size, + bias=False, + quant_config=quant_config) rope_scaling['type'] = 'deepseek_yarn' - self.rotary_emb = get_rope( - qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - is_neox_style=False - ) + self.rotary_emb = get_rope(qk_rope_head_dim, + rotary_dim=qk_rope_head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=False) if rope_scaling: mscale_all_dim = rope_scaling.get("mscale_all_dim", False) @@ -255,7 +267,7 @@ def __init__( # self.qk_head_dim, # self.scaling, # num_kv_heads=self.num_heads) - + # TODO, support head_size 192 self.attn = Attention(self.num_local_heads, 256, @@ -272,16 +284,21 @@ def forward( if self.q_lora_rank is not None: q = self.q_a_proj(hidden_states)[0] q = self.q_a_layernorm(q) - q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim) + q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, + self.qk_head_dim) else: - q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads, self.qk_head_dim) - q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads, + self.qk_head_dim) + q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], + dim=-1) latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0] - kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + kv_a, _ = latent_cache.split( + [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) latent_cache = latent_cache.unsqueeze(1) kv_a = self.kv_a_layernorm(kv_a.contiguous()) kv = self.kv_b_proj(kv_a)[0] - kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim) + kv = kv.view(-1, self.num_local_heads, + self.qk_nope_head_dim + self.v_head_dim) k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) k_pe = latent_cache[:, :, self.kv_lora_rank:] q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) @@ -289,11 +306,19 @@ def forward( k = torch.empty_like(q) k[..., :self.qk_nope_head_dim] = k_nope k[..., self.qk_nope_head_dim:] = k_pe - q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], value = 0).view(-1, self.num_local_heads * 256) - k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], value = 0).view(-1, self.num_local_heads * 256) - v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], value = 0).view(-1, self.num_local_heads * 256) + q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], + value=0).view(-1, + self.num_local_heads * 256) + k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], + value=0).view(-1, + self.num_local_heads * 256) + v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], + value=0).view(-1, + self.num_local_heads * 256) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) - attn_output = attn_output.view(-1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape(-1, self.num_local_heads * self.v_head_dim) + attn_output = attn_output.view( + -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape( + -1, self.num_local_heads * self.v_head_dim) output, _ = self.o_proj(attn_output) return output @@ -319,13 +344,14 @@ def __init__( qk_nope_head_dim=config.qk_nope_head_dim, 
qk_rope_head_dim=config.qk_rope_head_dim, v_head_dim=config.v_head_dim, - q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None, + q_lora_rank=config.q_lora_rank + if hasattr(config, "q_lora_rank") else None, kv_lora_rank=config.kv_lora_rank, rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, - layer_idx = layer_idx, + layer_idx=layer_idx, ) if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace @@ -390,7 +416,9 @@ def __init__( config.hidden_size, ) self.layers = nn.ModuleList([ - DeepseekV2DecoderLayer(config, layer_idx, quant_config=quant_config) + DeepseekV2DecoderLayer(config, + layer_idx, + quant_config=quant_config) for layer_idx in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) From ce3a80ad735255bd89903311b970ac89f1d358a8 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 21 May 2024 12:20:53 -0700 Subject: [PATCH 06/17] ruff --- vllm/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 0b8f0b0b72e1f..bcea74712ec21 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -253,7 +253,8 @@ def get_head_size(self) -> int: # TODO remove hard code if hasattr(self.hf_text_config, "model_type" ) and self.hf_text_config.model_type == 'deepseek_v2': - # FlashAttention suports only head_size 32, 64, 128, 256, we need to pad head_size 192 to 256 + # FlashAttention suports only head_size 32, 64, 128, 256, + # we need to pad head_size 192 to 256 return 256 if hasattr(self.hf_text_config, "head_dim"): return self.hf_text_config.head_dim From 59b6353a5d3bbcdef5e683d6889dac0612fa05d5 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 21 May 2024 13:23:34 -0700 Subject: [PATCH 07/17] refactor grouped moe --- .../layers/fused_moe/__init__.py | 3 +- .../layers/fused_moe/fused_moe.py | 100 ++++++++++-------- vllm/model_executor/models/deepseek_v2.py | 19 ++-- 3 files changed, 67 insertions(+), 55 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 2926c7d1c8a76..4ef55f24620c7 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,9 +1,10 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts, fused_moe, fused_topk, get_config_file_name) + fused_experts, fused_moe, fused_topk, grouped_topk, get_config_file_name) __all__ = [ "fused_moe", "fused_topk", + "grouped_topk", "fused_experts", "get_config_file_name", ] diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 27a5df67ef18c..d37e3999ed85c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -313,8 +313,6 @@ def fused_topk( gating_output: torch.Tensor, topk: int, renormalize: bool, - num_expert_group: int = 0, - topk_group: int = 0, ): assert hidden_states.shape[0] == gating_output.shape[0], ( "Number of tokens mismatch") @@ -328,48 +326,61 @@ def fused_topk( dtype=torch.float32) topk_weights, topk_ids = torch.topk(routing_weights, topk, dim=-1) else: - if num_expert_group == 0: - import vllm._moe_C as moe_kernels - - topk_weights = torch.empty(M, - topk, - dtype=torch.float32, - device=hidden_states.device) - topk_ids = torch.empty(M, + import vllm._moe_C as moe_kernels + + topk_weights = 
torch.empty(M, topk, - dtype=torch.int32, + dtype=torch.float32, device=hidden_states.device) - token_expert_indicies = torch.empty(M, - topk, - dtype=torch.int32, - device=hidden_states.device) - moe_kernels.topk_softmax( - topk_weights, - topk_ids, - token_expert_indicies, - gating_output.float(), # TODO(woosuk): Optimize this. - ) - del token_expert_indicies # Not used. Will be used in the future. - else: - scores = torch.softmax(gating_output, dim=-1) - num_token = scores.shape[0] - group_scores = scores.view(num_token, num_expert_group, - -1).max(dim=-1).values # [n, n_group] - group_idx = torch.topk(group_scores, - k=topk_group, - dim=-1, - sorted=False)[1] # [n, top_k_group] - group_mask = torch.zeros_like(group_scores) # [n, n_group] - group_mask.scatter_(1, group_idx, 1) # [n, n_group] - score_mask = group_mask.unsqueeze(-1).expand( - num_token, num_expert_group, - scores.shape[-1] // num_expert_group).reshape(num_token, - -1) # [n, e] - tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] - topk_weights, topk_ids = torch.topk(tmp_scores, - k=topk, - dim=-1, - sorted=False) + topk_ids = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) + token_expert_indicies = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) + moe_kernels.topk_softmax( + topk_weights, + topk_ids, + token_expert_indicies, + gating_output.float(), # TODO(woosuk): Optimize this. + ) + del token_expert_indicies # Not used. Will be used in the future. + + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) + return topk_weights, topk_ids + + +# This is used by the Deepseek-V2 model +def grouped_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + num_expert_group: int = 0, + topk_group: int = 0, +): + scores = torch.softmax(gating_output, dim=-1) + num_token = scores.shape[0] + group_scores = scores.view(num_token, num_expert_group, + -1).max(dim=-1).values # [n, n_group] + group_idx = torch.topk(group_scores, + k=topk_group, + dim=-1, + sorted=False)[1] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = group_mask.unsqueeze(-1).expand( + num_token, num_expert_group, + scores.shape[-1] // num_expert_group).reshape(num_token, + -1) # [n, e] + tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] + topk_weights, topk_ids = torch.topk(tmp_scores, + k=topk, + dim=-1, + sorted=False) if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) @@ -500,8 +511,6 @@ def fused_moe( w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, - num_expert_group: int = 0, - topk_group: int = 0, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of @@ -533,8 +542,7 @@ def fused_moe( assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch" topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize, num_expert_group, - topk_group) + renormalize) return fused_experts(hidden_states, w1, w2, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index e3bee8b478f78..7bcc3c50f11fd 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -32,7 +32,7 @@ get_tensor_model_parallel_world_size, 
tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe import fused_experts, grouped_topk from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, ReplicatedLinear, @@ -153,16 +153,19 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: shared_output = self.shared_experts(hidden_states) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = fused_moe( + topk_weights, topk_ids = grouped_topk( + hidden_states, router_logits, self.top_k, + renormalize=self.config.norm_topk_prob, + num_expert_group=self.config.n_group, + topk_group=self.config.topk_group + ) + final_hidden_states = fused_experts( hidden_states, self.w1, self.w2, - router_logits, - self.top_k, - renormalize=self.config.norm_topk_prob, - inplace=True, - num_expert_group=self.config.n_group, - topk_group=self.config.topk_group) * self.routed_scaling_factor + topk_weights, + topk_ids, + inplace=True) * self.routed_scaling_factor if self.config.n_shared_experts is not None: final_hidden_states = final_hidden_states + shared_output final_hidden_states = tensor_model_parallel_all_reduce( From 1ce0c2adf7c532cb5a630f23d9ba3c87220264e8 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 21 May 2024 13:27:54 -0700 Subject: [PATCH 08/17] format --- vllm/model_executor/layers/fused_moe/fused_moe.py | 9 +++------ vllm/model_executor/models/deepseek_v2.py | 7 ++++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index d37e3999ed85c..6d6d469582c35 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -360,22 +360,19 @@ def grouped_topk( topk: int, renormalize: bool, num_expert_group: int = 0, - topk_group: int = 0, + topk_group: int = 0, ): scores = torch.softmax(gating_output, dim=-1) num_token = scores.shape[0] group_scores = scores.view(num_token, num_expert_group, -1).max(dim=-1).values # [n, n_group] - group_idx = torch.topk(group_scores, - k=topk_group, - dim=-1, + group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1] # [n, top_k_group] group_mask = torch.zeros_like(group_scores) # [n, n_group] group_mask.scatter_(1, group_idx, 1) # [n, n_group] score_mask = group_mask.unsqueeze(-1).expand( num_token, num_expert_group, - scores.shape[-1] // num_expert_group).reshape(num_token, - -1) # [n, e] + scores.shape[-1] // num_expert_group).reshape(num_token, -1) # [n, e] tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e] topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 7bcc3c50f11fd..eb34bd3df9056 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -154,11 +154,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) topk_weights, topk_ids = grouped_topk( - hidden_states, router_logits, self.top_k, + hidden_states, + router_logits, + self.top_k, renormalize=self.config.norm_topk_prob, num_expert_group=self.config.n_group, - topk_group=self.config.topk_group - ) + 
topk_group=self.config.topk_group) final_hidden_states = fused_experts( hidden_states, self.w1, From bf988622682591246e06bcc800bb5e881433338e Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 21 May 2024 13:29:15 -0700 Subject: [PATCH 09/17] typo --- vllm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index bcea74712ec21..fe32af3138d63 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -253,7 +253,7 @@ def get_head_size(self) -> int: # TODO remove hard code if hasattr(self.hf_text_config, "model_type" ) and self.hf_text_config.model_type == 'deepseek_v2': - # FlashAttention suports only head_size 32, 64, 128, 256, + # FlashAttention supports only head_size 32, 64, 128, 256, # we need to pad head_size 192 to 256 return 256 if hasattr(self.hf_text_config, "head_dim"): From ca9c0ee1165980a803d1f67e274fd030956b011d Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 21 May 2024 13:41:06 -0700 Subject: [PATCH 10/17] import order --- vllm/model_executor/layers/fused_moe/__init__.py | 4 ++-- vllm/model_executor/models/deepseek_v2.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 4ef55f24620c7..1dafae503dfb2 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,10 +1,10 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts, fused_moe, fused_topk, grouped_topk, get_config_file_name) + fused_experts, fused_moe, fused_topk, get_config_file_name, grouped_topk) __all__ = [ "fused_moe", "fused_topk", - "grouped_topk", "fused_experts", "get_config_file_name", + "grouped_topk", ] diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index eb34bd3df9056..0b96396417787 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -34,10 +34,10 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_experts, grouped_topk from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, ReplicatedLinear, - RowParallelLinear, - ColumnParallelLinear) + RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) From 0746b4f3c8ca797d341ad077028ea7197bc32561 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 27 Jun 2024 21:15:03 -0700 Subject: [PATCH 11/17] Update fused_moe.py --- vllm/model_executor/layers/fused_moe/fused_moe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 2fc373a855010..b750fc713b43f 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -346,11 +346,15 @@ def fused_topk( topk, dtype=torch.float32, device=hidden_states.device) + topk_ids = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) token_expert_indicies = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device) - moe_kernels.topk_softmax( + ops.topk_softmax( topk_weights, 
topk_ids, token_expert_indicies, From 2443f275a2ad4c284ba27eeb09778dbd6648daef Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 27 Jun 2024 21:24:32 -0700 Subject: [PATCH 12/17] Update rotary_embedding.py --- vllm/model_executor/layers/rotary_embedding.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 0eaac967e4bb4..6b8ee8d965401 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -630,11 +630,12 @@ def __init__( base: int, is_neox_style: bool, scaling_factor: float, + dtype: torch.dtype, *, extrapolation_factor: float = 1, attn_factor: float = 1, - beta_fast: float = 32, - beta_slow: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, mscale: float = 1, mscale_all_dim: float = 0, ) -> None: @@ -649,7 +650,7 @@ def __init__( yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) * attn_factor) super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style) + is_neox_style, dtype) def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor: pos_freqs = self.base**(torch.arange( From 44f087c8747911e2fdc85030707479b01171ba6d Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 27 Jun 2024 21:41:39 -0700 Subject: [PATCH 13/17] Update deepseek_v2.py --- vllm/model_executor/models/deepseek_v2.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 0b96396417787..0a7342bd5ce10 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -197,6 +197,7 @@ def __init__( rope_theta: float = 10000, rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 8192, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, layer_idx=None, ) -> None: @@ -276,7 +277,9 @@ def __init__( self.attn = Attention(self.num_local_heads, 256, self.scaling, - num_kv_heads=self.num_local_heads) + num_kv_heads=self.num_local_heads, + cache_config=cache_config, + quant_config=quant_config) def forward( self, @@ -333,6 +336,7 @@ def __init__( self, config: PretrainedConfig, layer_idx: int, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() @@ -354,6 +358,7 @@ def __init__( rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, + cache_config=cache_config, quant_config=quant_config, layer_idx=layer_idx, ) @@ -409,6 +414,7 @@ class DeepseekV2Model(nn.Module): def __init__( self, config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() @@ -422,6 +428,7 @@ def __init__( self.layers = nn.ModuleList([ DeepseekV2DecoderLayer(config, layer_idx, + cache_config=cache_config, quant_config=quant_config) for layer_idx in range(config.num_hidden_layers) ]) @@ -450,12 +457,13 @@ class DeepseekV2ForCausalLM(nn.Module): def __init__( self, config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: super().__init__() self.config = config self.quant_config = quant_config - self.model = DeepseekV2Model(config, quant_config) + self.model = DeepseekV2Model(config, cache_config, quant_config) 
self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() From df65a69fa2de8fb4ad162b99146d01e86bef62bc Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 27 Jun 2024 21:42:56 -0700 Subject: [PATCH 14/17] Update deepseek_v2.py --- vllm/model_executor/models/deepseek_v2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 0a7342bd5ce10..3d4f78c664776 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -28,6 +28,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) From 1d90229df6b584da5816e13b7612077242e3b7b6 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 27 Jun 2024 21:46:20 -0700 Subject: [PATCH 15/17] Update rotary_embedding.py --- vllm/model_executor/layers/rotary_embedding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 6b8ee8d965401..91fa16ce4a492 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -804,7 +804,8 @@ def get_rope( } rotary_emb = DeepseekScalingRotaryEmbedding( head_size, rotary_dim, original_max_position, base, - is_neox_style, scaling_factor, **extra_kwargs) + is_neox_style, scaling_factor, + dtype=torch.get_default_dtype(), *extra_kwargs) # The correct one should be "longrope" but keep "su" here # for backward compatible elif scaling_type == "su" or scaling_type == "longrope": From e06d0d2a7eb42f0f644f6e3a227306524c3e8e19 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 27 Jun 2024 21:50:24 -0700 Subject: [PATCH 16/17] Update rotary_embedding.py --- vllm/model_executor/layers/rotary_embedding.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 91fa16ce4a492..c9d21ed6e3f60 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -804,8 +804,7 @@ def get_rope( } rotary_emb = DeepseekScalingRotaryEmbedding( head_size, rotary_dim, original_max_position, base, - is_neox_style, scaling_factor, - dtype=torch.get_default_dtype(), *extra_kwargs) + is_neox_style, scaling_factor, dtype, *extra_kwargs) # The correct one should be "longrope" but keep "su" here # for backward compatible elif scaling_type == "su" or scaling_type == "longrope": From 703e6a310739a9692b5db5510f721b56fbcaab8c Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Thu, 27 Jun 2024 21:51:39 -0700 Subject: [PATCH 17/17] Update rotary_embedding.py --- vllm/model_executor/layers/rotary_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index c9d21ed6e3f60..1285627ec3cc5 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -804,7 +804,7 @@ def get_rope( } rotary_emb = DeepseekScalingRotaryEmbedding( head_size, rotary_dim, original_max_position, base, - is_neox_style, scaling_factor, dtype, 
*extra_kwargs) + is_neox_style, scaling_factor, dtype, **extra_kwargs) # The correct one should be "longrope" but keep "su" here # for backward compatible elif scaling_type == "su" or scaling_type == "longrope":
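
The series closes by replacing `*extra_kwargs` with `**extra_kwargs` in the `get_rope` call. As a hedged aside, the following standalone sketch (hypothetical `ctor` and toy values, not vLLM code) shows why that one character matters: single-star unpacking spreads a dict's *keys* as extra positional arguments, while double-star unpacking forwards its items as the intended keyword arguments.

    # Minimal sketch of dict-unpacking semantics; `ctor` and the values are
    # hypothetical stand-ins, not the DeepseekScalingRotaryEmbedding API.
    def ctor(head_size, *, beta_fast=32, beta_slow=1):
        return head_size, beta_fast, beta_slow

    extra_kwargs = {"beta_fast": 16, "beta_slow": 2}

    print(ctor(64, **extra_kwargs))  # (64, 16, 2): values reach the keyword params
    try:
        ctor(64, *extra_kwargs)      # spreads the keys "beta_fast", "beta_slow"
    except TypeError as exc:         # as extra positional arguments
        print(exc)                   # e.g. "ctor() takes 1 positional argument but 3 were given"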
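
For readers skimming the series, the group-limited routing that `grouped_topk` introduces can be exercised on its own. Below is a minimal standalone sketch in plain PyTorch that mirrors the logic shown in the fused_moe diff — toy tensor sizes, CPU-only, and not an import from vLLM — restricting each token's expert candidates to its `topk_group` strongest expert groups before the usual per-token top-k.

    # Standalone sketch of group-limited top-k routing, mirroring the
    # grouped_topk logic in the fused_moe diff (toy sizes, CPU-only).
    import torch


    def grouped_topk_sketch(gating_output: torch.Tensor, topk: int,
                            renormalize: bool, num_expert_group: int,
                            topk_group: int):
        scores = torch.softmax(gating_output, dim=-1)            # [n, e]
        num_token = scores.shape[0]
        # Score each expert group by its best expert; keep only the top groups.
        group_scores = scores.view(num_token, num_expert_group,
                                   -1).max(dim=-1).values        # [n, n_group]
        group_idx = torch.topk(group_scores, k=topk_group, dim=-1,
                               sorted=False)[1]                  # [n, topk_group]
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, group_idx, 1)
        score_mask = group_mask.unsqueeze(-1).expand(
            num_token, num_expert_group,
            scores.shape[-1] // num_expert_group).reshape(num_token, -1)
        # Zero out experts outside the selected groups, then take the final top-k.
        masked = scores.masked_fill(~score_mask.bool(), 0.0)
        topk_weights, topk_ids = torch.topk(masked, k=topk, dim=-1, sorted=False)
        if renormalize:
            topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
        return topk_weights, topk_ids


    # Toy check: 4 tokens, 8 experts in 4 groups; pick 2 experts drawn only
    # from each token's 2 strongest groups.
    logits = torch.randn(4, 8)
    weights, ids = grouped_topk_sketch(logits, topk=2, renormalize=True,
                                       num_expert_group=4, topk_group=2)
    print(ids, weights.sum(dim=-1))  # chosen expert ids; per-token sums ~1.0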