diff --git a/install/install_requirements.sh b/install/install_requirements.sh
index 264c3496d..360ba1801 100755
--- a/install/install_requirements.sh
+++ b/install/install_requirements.sh
@@ -51,13 +51,13 @@ echo "Using pip executable: $PIP_EXECUTABLE"
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-PYTORCH_NIGHTLY_VERSION=dev20250119
+PYTORCH_NIGHTLY_VERSION=dev20250124
 
 # Nightly version for torchvision
-VISION_NIGHTLY_VERSION=dev20250119
+VISION_NIGHTLY_VERSION=dev20250124
 
 # Nightly version for torchtune
-TUNE_NIGHTLY_VERSION=dev20250119
+TUNE_NIGHTLY_VERSION=dev20250124
 
 # The pip repository that hosts nightly torch packages. cpu by default.
 # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly
diff --git a/torchchat/model.py b/torchchat/model.py
index c01ff1262..ce7dcb5e4 100644
--- a/torchchat/model.py
+++ b/torchchat/model.py
@@ -657,7 +657,7 @@ def __init__(self, config: TransformerArgs) -> None:
             self.layers[str(layer_id)] = TransformerBlock(config)
 
         if config.stage_idx == config.n_stages - 1:
-            self.norm = RMSNorm(config.dim, eps=config.norm_eps)
+            self.norm = nn.RMSNorm(config.dim, eps=config.norm_eps)
             self.output = nn.Linear(config.dim, config.vocab_size, bias=False)
             if config.tie_word_embeddings:
                 self.output.weight = self.tok_embeddings.weight
@@ -751,8 +751,8 @@ def __init__(self, config: TransformerArgs) -> None:
         super().__init__()
         self.attention = Attention(config)
         self.feed_forward = FeedForward(config)
-        self.ffn_norm = RMSNorm(config.dim, config.norm_eps)
-        self.attention_norm = RMSNorm(config.dim, config.norm_eps)
+        self.ffn_norm = nn.RMSNorm(config.dim, config.norm_eps)
+        self.attention_norm = nn.RMSNorm(config.dim, config.norm_eps)
         # None for llama architecture, set for granite architectures
         self.residual_multiplier = (
             config.residual_multiplier
@@ -928,20 +928,6 @@ def forward(self, x: Tensor) -> Tensor:
         return self.w2(F.silu(self.w1(x)) * self.w3(x))
 
 
-class RMSNorm(nn.Module):
-    def __init__(self, dim: int, eps: float = 1e-5):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-
-    def _norm(self, x):
-        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)
-
-    def forward(self, x: Tensor) -> Tensor:
-        output = self._norm(x.float()).type_as(x)
-        return output * self.weight
-
-
 def apply_scaling(freqs: torch.Tensor, rope_scaling: Dict[str, Any]):
     # Check for the presence of the required keys
     required_keys = {
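
Sanity check for the RMSNorm swap above: a minimal sketch comparing the deleted hand-rolled RMSNorm against torch.nn.RMSNorm, which this patch adopts. It assumes PyTorch >= 2.4 (where nn.RMSNorm became available, consistent with the pinned nightlies); the LegacyRMSNorm name, test shapes, and tolerances are illustrative choices, not part of the patch.

import torch
import torch.nn as nn


class LegacyRMSNorm(nn.Module):
    # Body copied from the RMSNorm class deleted in torchchat/model.py.
    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize in fp32, cast back to the input dtype, then scale.
        output = self._norm(x.float()).type_as(x)
        return output * self.weight


dim, eps = 64, 1e-5                  # illustrative values
old_norm = LegacyRMSNorm(dim, eps)
new_norm = nn.RMSNorm(dim, eps=eps)  # mirrors the new call sites in the diff

x = torch.randn(2, 8, dim)           # fp32 input, so both paths compute in fp32
torch.testing.assert_close(old_norm(x), new_norm(x), rtol=1e-5, atol=1e-6)

Note that parity is only checked here for fp32 inputs: the legacy class explicitly upcasts to fp32 before normalizing, while nn.RMSNorm's internal precision for half-precision inputs may differ slightly, so half-precision outputs could diverge within rounding error.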