Refactor to introduce Trainer & TrainingArguments, add SetFit ABSA #265

Merged (97 commits, Nov 10, 2023)
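For orientation, a minimal sketch of how the Trainer and TrainingArguments introduced by this PR are meant to be used; the argument names follow the final state of the PR, and the dataset and hyperparameter choices below are purely illustrative:

from datasets import load_dataset
from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset

# Few-shot setup: sample 8 labeled examples per class from a full training split.
dataset = load_dataset("SetFit/sst2")
train_dataset = sample_dataset(dataset["train"], label_column="label", num_samples=8)

model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

# Hyperparameters move off the trainer and onto a dedicated TrainingArguments object.
args = TrainingArguments(
    batch_size=16,
    num_epochs=1,
    sampling_strategy="oversampling",
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=dataset["test"],
    metric="accuracy",
)
trainer.train()
print(trainer.evaluate())  # e.g. {"accuracy": ...}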
Commits (97 in total; the file changes shown below are from the first 12 commits)
1acdd5c
Implement Trainer & TrainingArguments w. tests
tomaarsen Jan 11, 2023
89f4435
Readded support for hyperparameter tuning
tomaarsen Jan 11, 2023
5f2a6b3
Remove unused imports and reformat
tomaarsen Jan 11, 2023
622f33b
Preserve desired behaviour despite deprecation of keep_body_frozen pa…
tomaarsen Jan 11, 2023
ff59154
Ensure that DeprecationWarnings are displayed
tomaarsen Jan 11, 2023
3b4ef58
Set Trainer.freeze and Trainer.unfreeze methods normally
tomaarsen Jan 11, 2023
fd68274
Add TrainingArgument tests for num_epochs, batch_sizes, lr
tomaarsen Jan 11, 2023
14602ea
Convert trainer.train arguments into a softer deprecation
tomaarsen Jan 11, 2023
94106cc
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Jan 22, 2023
a39e772
Merge branch 'refactor_v2' of https://github.com/tomaarsen/setfit; br…
tomaarsen Jan 23, 2023
9fc55a6
Use body/head_learning_rate instead of classifier/embedding_learning_…
tomaarsen Jan 23, 2023
7d4ad00
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Jan 23, 2023
aab2377
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Feb 6, 2023
dee70b1
Reformat according to the newest black version
tomaarsen Feb 6, 2023
fb6547d
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Feb 6, 2023
abbbb03
Remove "classifier" from var names in SetFitHead
tomaarsen Feb 6, 2023
12d326e
Update DeprecationWarnings to include timeline
tomaarsen Feb 6, 2023
70c0295
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Feb 6, 2023
fc246cc
Convert training_argument imports to relative imports
tomaarsen Feb 6, 2023
57aa54f
Make conditional explicit
tomaarsen Feb 6, 2023
7ebdf93
Make conditional explicit
tomaarsen Feb 6, 2023
4695293
Use assertEqual rather than assert
tomaarsen Feb 6, 2023
4c6d0fd
Remove training_arguments from test func names
tomaarsen Feb 6, 2023
5937ec2
Replace loss_class on Trainer with loss on TrainArgs
tomaarsen Feb 6, 2023
f1e3de9
Removed dead class argument
tomaarsen Feb 6, 2023
6051095
Move SupConLoss to losses.py
tomaarsen Feb 6, 2023
bddd46a
Add deprecation to Trainer.(un)freeze
tomaarsen Feb 7, 2023
fa8a077
Prevent warning from always triggering
tomaarsen Feb 7, 2023
85a3684
Export TrainingArguments in __init__
tomaarsen Feb 7, 2023
ca625a2
Update & add important missing docstrings
tomaarsen Feb 7, 2023
868d7b7
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Feb 7, 2023
68e9094
Use standard dataclass initialization for SetFitModel
tomaarsen Feb 8, 2023
19a6fc8
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Feb 15, 2023
0b2efa1
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Feb 15, 2023
ca87c42
Remove duplicate space in DeprecationWarning
tomaarsen Feb 16, 2023
cc5282f
No longer require labeled data for DistillationTrainer
tomaarsen Mar 3, 2023
c6f5782
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Mar 3, 2023
36cbbfe
Update docs for v1.0.0
tomaarsen Mar 6, 2023
deb57ff
Remove references of SetFitTrainer
tomaarsen Mar 6, 2023
46922d5
Update expected test output
tomaarsen Mar 6, 2023
f43d5b2
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Apr 19, 2023
b0f9f58
Remove unused pipeline
tomaarsen Apr 19, 2023
339f332
Execute deprecations
tomaarsen Apr 19, 2023
9e0bf78
Stop importing now-removed function
tomaarsen Apr 19, 2023
ecabbcf
Initial setup for logging & callbacks
tomaarsen Jul 6, 2023
6e6720b
Move sentence-transformer training into trainer.py
tomaarsen Jul 6, 2023
826eb53
Add checkpointing, support EarlyStoppingCallback
tomaarsen Jul 28, 2023
019a971
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Jul 29, 2023
1930973
Run formatting
tomaarsen Jul 29, 2023
e4f3f76
Merge branch 'refactor_v2' of https://github.com/tomaarsen/setfit int…
tomaarsen Jul 29, 2023
0f66109
Merge pull request #4 from tomaarsen/feat/logging_callbacks
tomaarsen Jul 29, 2023
a87cdc0
Add additional trainer tests
tomaarsen Jul 29, 2023
d418759
Use isinstance, required by flake8 release from 1hr ago
tomaarsen Jul 29, 2023
08892f6
sampler for refactor WIP
danstan5 Sep 14, 2023
0a2b664
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Oct 17, 2023
429de0f
Merge branch 'refactor_v2' of https://github.com/tomaarsen/setfit int…
tomaarsen Oct 17, 2023
173f084
Run formatters
tomaarsen Oct 17, 2023
c23959a
Remove tests from modeling.py
tomaarsen Oct 17, 2023
0fa3870
Add missing type hint
tomaarsen Oct 17, 2023
3969f38
Adjust test to still pass if W&B/Tensorboard are installed
tomaarsen Oct 17, 2023
567f1c9
Merge branch 'refactor_v2' of https://github.com/tomaarsen/setfit int…
tomaarsen Oct 17, 2023
851f0bb
The log/eval/save steps should be saved on the state instead
tomaarsen Oct 17, 2023
67ddedc
Merge branch 'refactor_v2' of https://github.com/tomaarsen/setfit int…
tomaarsen Oct 17, 2023
d37ee09
sampler logic fix "unique" strategy
danstan5 Oct 19, 2023
0ef8837
add sampler tests (not complete)
danstan5 Oct 19, 2023
131aa26
add sampling_strategy into TrainingArguments
danstan5 Oct 19, 2023
c6c6228
Merge branch 'refactor-sampling' of https://github.com/danstan5/setfi…
danstan5 Oct 19, 2023
7431005
num_iterations removed from TrainingArguments
danstan5 Oct 19, 2023
3bd2acc
run_fewshot compatible with <v.1.0.0
danstan5 Oct 20, 2023
3d07e6c
Run make style
tomaarsen Oct 25, 2023
978daee
Use "no" as the default evaluation_strategy
tomaarsen Oct 25, 2023
2802a3f
Move num_iterations back to TrainingArguments
tomaarsen Oct 25, 2023
391f991
Fix broken trainer tests due to new default sampling
tomaarsen Oct 25, 2023
f8b7253
Use the Contrastive Dataset for Distillation
tomaarsen Oct 25, 2023
38e9607
Set the default logging steps at 50
tomaarsen Oct 25, 2023
4ead15d
Add max_steps argument to TrainingArguments
tomaarsen Oct 25, 2023
eb70336
Change max_steps conditional
tomaarsen Oct 25, 2023
3478799
Merge pull request #5 from danstan5/refactor-sampling
tomaarsen Oct 27, 2023
d9c4a05
Merge branch 'main' of https://github.com/huggingface/setfit into ref…
tomaarsen Nov 9, 2023
5b39f06
Seeds are now correctly applied for reproducibility
tomaarsen Nov 9, 2023
7c3feed
Don't scale gradients during evaluation
tomaarsen Nov 9, 2023
cdc8979
Use evaluation_strategy="steps" if eval_steps is set
tomaarsen Nov 9, 2023
e040167
Run formatting
tomaarsen Nov 9, 2023
d2f2489
Implement SetFit for ABSA from Intel Labs (#6)
tomaarsen Nov 9, 2023
5c4569d
Import optuna under TYPE_CHECKING
tomaarsen Nov 9, 2023
ceeb725
Remove unused import, reformat
tomaarsen Nov 9, 2023
5c669b5
Add MANIFEST.in with model_card_template
tomaarsen Nov 9, 2023
8e201e5
Don't require transformers TrainingArgs in tests
tomaarsen Nov 9, 2023
6ae5045
Update URLs in setup.py
tomaarsen Nov 9, 2023
ecaabb4
Increase min hf_hub version to 0.12.0 for SoftTemporaryDirectory
tomaarsen Nov 9, 2023
4e79397
Include MANIFEST.in data via `include_package_data=True`
tomaarsen Nov 9, 2023
65aff32
Use kwargs instead of args in super call
tomaarsen Nov 9, 2023
eeeac55
Use v0.13.0 as min. version as huggingface/huggingface_hub#1315
tomaarsen Nov 9, 2023
3214f1b
Use en_core_web_sm for tests
tomaarsen Nov 10, 2023
2b78bb0
Remove incorrect spacy_model from AspectModel/PolarityModel
tomaarsen Nov 10, 2023
b68f655
Rerun formatting
tomaarsen Nov 10, 2023
d85f0d9
Run CI on pre branch & workflow dispatch
tomaarsen Nov 10, 2023
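Commit d2f2489 above brings in the ABSA work from Intel Labs. A rough sketch of the expected usage, assuming the AbsaModel entry point and the spaCy-based aspect extraction referenced in the commit messages; the checkpoint names and example text are placeholders:

from setfit import AbsaModel

# ABSA chains two SetFit models: one classifies candidate aspect spans proposed by a
# spaCy pipeline (e.g. en_core_web_sm), the other classifies each span's polarity.
model = AbsaModel.from_pretrained(
    "sentence-transformers/paraphrase-mpnet-base-v2",  # aspect extraction backbone
    "sentence-transformers/paraphrase-mpnet-base-v2",  # polarity classification backbone
    spacy_model="en_core_web_sm",
)

preds = model.predict([
    "The food was delicious but the service was painfully slow.",
])
# Expected output: per sentence, a list of {"span": ..., "polarity": ...} dicts.
print(preds)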
10 changes: 8 additions & 2 deletions src/setfit/__init__.py
@@ -1,6 +1,12 @@
 __version__ = "0.6.0.dev0"
 
+import warnings
+
 from .data import add_templated_examples, sample_dataset
 from .modeling import SetFitHead, SetFitModel
-from .trainer import SetFitTrainer
-from .trainer_distillation import DistillationSetFitTrainer
+from .trainer import SetFitTrainer, Trainer
+from .trainer_distillation import DistillationSetFitTrainer, DistillationTrainer
+
+
+# Ensure that DeprecationWarnings are always shown
+warnings.filterwarnings("default", category=DeprecationWarning)
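The new filter matters because Python suppresses DeprecationWarning by default outside of __main__; the "default" action prints each matching warning once per call site. A standalone illustration, where old_api is a hypothetical stand-in for a deprecated SetFit entry point:

import warnings

# Mirror the filter added in src/setfit/__init__.py above.
warnings.filterwarnings("default", category=DeprecationWarning)

def old_api() -> None:
    # Hypothetical deprecated entry point, standing in for e.g. SetFitTrainer.
    warnings.warn(
        "`old_api` is deprecated and will be removed in a future release.",
        DeprecationWarning,
        stacklevel=2,
    )

for _ in range(3):
    old_api()  # printed only once: repeats from the same call site are suppressed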
93 changes: 60 additions & 33 deletions src/setfit/modeling.py
@@ -1,4 +1,5 @@
 import os
+import warnings
 from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
@@ -14,14 +15,14 @@
 import numpy as np
 import requests
 import torch
-import torch.nn as nn
 from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
 from sentence_transformers import InputExample, SentenceTransformer, models
 from sklearn.linear_model import LogisticRegression
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.multioutput import ClassifierChain, MultiOutputClassifier
+from torch import nn
 from torch.utils.data import DataLoader
-from tqdm.auto import tqdm
+from tqdm.auto import tqdm, trange
 
 from . import logging
 from .data import SetFitDataset
@@ -216,7 +217,7 @@ def predict(self, x_test: torch.Tensor) -> torch.Tensor:
             return torch.where(probs >= 0.5, 1, 0)
         return torch.argmax(probs, dim=-1)
 
-    def get_loss_fn(self):
+    def get_loss_fn(self) -> nn.Module:
         if self.multitarget:  # if sigmoid output
             return torch.nn.BCEWithLogitsLoss()
         return torch.nn.CrossEntropyLoss()
@@ -242,9 +243,9 @@ def get_config_dict(self) -> Dict[str, Optional[Union[int, float, bool]]]:
     @staticmethod
     def _init_weight(module):
         if isinstance(module, nn.Linear):
-            torch.nn.init.xavier_uniform_(module.weight)
+            nn.init.xavier_uniform_(module.weight)
             if module.bias is not None:
-                torch.nn.init.constant_(module.bias, 1e-2)
+                nn.init.constant_(module.bias, 1e-2)
 
     def __repr__(self):
         return "SetFitHead({})".format(self.get_config_dict())
@@ -280,25 +281,29 @@ def fit(
         self,
         x_train: List[str],
         y_train: Union[List[int], List[List[int]]],
-        num_epochs: int,
-        batch_size: Optional[int] = None,
-        learning_rate: Optional[float] = None,
-        body_learning_rate: Optional[float] = None,
+        classifier_num_epochs: int,
+        classifier_batch_size: Optional[int] = None,
+        body_classifier_learning_rate: Optional[float] = None,
+        head_learning_rate: Optional[float] = None,
         l2_weight: Optional[float] = None,
         max_length: Optional[int] = None,
-        show_progress_bar: Optional[bool] = None,
+        show_progress_bar: bool = True,
+        end_to_end: bool = False,
+        **kwargs,
     ) -> None:
         if self.has_differentiable_head:  # train with pyTorch
             device = self.model_body.device
             self.model_body.train()
             self.model_head.train()
+            if not end_to_end:
+                self.freeze("body")
 
-            dataloader = self._prepare_dataloader(x_train, y_train, batch_size, max_length)
+            dataloader = self._prepare_dataloader(x_train, y_train, classifier_batch_size, max_length)
             criterion = self.model_head.get_loss_fn()
-            optimizer = self._prepare_optimizer(learning_rate, body_learning_rate, l2_weight)
+            optimizer = self._prepare_optimizer(head_learning_rate, body_classifier_learning_rate, l2_weight)
             scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
-            for epoch_idx in tqdm(range(num_epochs), desc="Epoch", disable=not show_progress_bar):
-                for batch in dataloader:
+            for epoch_idx in trange(classifier_num_epochs, desc="Epoch", disable=not show_progress_bar):
+                for batch in tqdm(dataloader, desc="Iteration", disable=not show_progress_bar, leave=False):
                     features, labels = batch
                     optimizer.zero_grad()
 
@@ -308,15 +313,18 @@ def fit(
 
                     outputs = self.model_body(features)
                     if self.normalize_embeddings:
-                        outputs = torch.nn.functional.normalize(outputs, p=2, dim=1)
+                        outputs = nn.functional.normalize(outputs, p=2, dim=1)
                     outputs = self.model_head(outputs)
                     logits = outputs["logits"]
 
-                    loss = criterion(logits, labels)
+                    loss: torch.Tensor = criterion(logits, labels)
                     loss.backward()
                     optimizer.step()
 
                 scheduler.step()
+
+            if not end_to_end:
+                self.unfreeze("body")
         else:  # train with sklearn
             embeddings = self.model_body.encode(x_train, normalize_embeddings=self.normalize_embeddings)
             self.model_head.fit(embeddings, y_train)
@@ -359,16 +367,20 @@ def _prepare_dataloader(
 
     def _prepare_optimizer(
         self,
-        learning_rate: float,
-        body_learning_rate: Optional[float],
+        head_learning_rate: float,
+        body_classifier_learning_rate: Optional[float],
         l2_weight: float,
     ) -> torch.optim.Optimizer:
-        body_learning_rate = body_learning_rate or learning_rate
+        body_classifier_learning_rate = body_classifier_learning_rate or head_learning_rate
         l2_weight = l2_weight or self.l2_weight
         optimizer = torch.optim.AdamW(
             [
-                {"params": self.model_body.parameters(), "lr": body_learning_rate, "weight_decay": l2_weight},
-                {"params": self.model_head.parameters(), "lr": learning_rate, "weight_decay": l2_weight},
+                {
+                    "params": self.model_body.parameters(),
+                    "lr": body_classifier_learning_rate,
+                    "weight_decay": l2_weight,
+                },
+                {"params": self.model_head.parameters(), "lr": head_learning_rate, "weight_decay": l2_weight},
             ],
         )
 
@@ -378,25 +390,40 @@ def freeze(self, component: Optional[Literal["body", "head"]] = None) -> None:
         if component is None or component == "body":
             self._freeze_or_not(self.model_body, to_freeze=True)
 
-        if component is None or component == "head":
+        if (component is None or component == "head") and self.has_differentiable_head:
             self._freeze_or_not(self.model_head, to_freeze=True)
 
-    def unfreeze(self, component: Optional[Literal["body", "head"]] = None) -> None:
+    def unfreeze(
+        self, component: Optional[Literal["body", "head"]] = None, keep_body_frozen: Optional[bool] = None
+    ) -> None:
+        if keep_body_frozen is not None:
+            warnings.warn(
+                '`keep_body_frozen` is deprecated. Please either pass "head", "body" or no arguments to unfreeze both.',
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            # If the body must stay frozen, only unfreeze the head. Eventually, this entire if-branch
+            # can be removed.
+            if keep_body_frozen and not component:
+                component = "head"
+
         if component is None or component == "body":
             self._freeze_or_not(self.model_body, to_freeze=False)
 
-        if component is None or component == "head":
+        if (component is None or component == "head") and self.has_differentiable_head:
             self._freeze_or_not(self.model_head, to_freeze=False)
 
-    def _freeze_or_not(self, model: torch.nn.Module, to_freeze: bool) -> None:
+    def _freeze_or_not(self, model: nn.Module, to_freeze: bool) -> None:
         for param in model.parameters():
             param.requires_grad = not to_freeze
 
-    def predict(self, x_test: List[str], as_numpy: bool = False) -> Union[torch.Tensor, "ndarray"]:
-        embeddings = self.model_body.encode(
-            x_test, normalize_embeddings=self.normalize_embeddings, convert_to_tensor=self.has_differentiable_head
+    def encode(self, inputs: List[str]) -> Union[torch.Tensor, "ndarray"]:
+        return self.model_body.encode(
+            inputs, normalize_embeddings=self.normalize_embeddings, convert_to_tensor=self.has_differentiable_head
         )
 
+    def predict(self, inputs: List[str], as_numpy: bool = False) -> Union[torch.Tensor, "ndarray"]:
+        embeddings = self.encode(inputs)
         outputs = self.model_head.predict(embeddings)
 
         if as_numpy and self.has_differentiable_head:
@@ -406,11 +433,8 @@ def predict(self, x_test: List[str], as_numpy: bool = False) -> Union[torch.Tensor, "ndarray"]:
 
         return outputs
 
-    def predict_proba(self, x_test: List[str], as_numpy: bool = False) -> Union[torch.Tensor, "ndarray"]:
-        embeddings = self.model_body.encode(
-            x_test, normalize_embeddings=self.normalize_embeddings, convert_to_tensor=self.has_differentiable_head
-        )
-
+    def predict_proba(self, inputs: List[str], as_numpy: bool = False) -> Union[torch.Tensor, "ndarray"]:
+        embeddings = self.encode(inputs)
         outputs = self.model_head.predict_proba(embeddings)
 
         if as_numpy and self.has_differentiable_head:
@@ -429,6 +453,9 @@ def to(self, device: Union[str, torch.device]) -> "SetFitModel":
         Returns:
             SetFitModel: Returns the original model, but now on the desired device.
         """
+        # Note that we must also set _target_device, or any SentenceTransformer.fit() call will reset
+        # the body location
+        self.model_body._target_device = device if isinstance(device, torch.device) else torch.device(device)
         self.model_body = self.model_body.to(device)
 
         if self.has_differentiable_head:
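With predict and predict_proba now routed through the shared encode helper, inference follows the same path for both the scikit-learn head and the differentiable torch head. A short sketch, assuming the pre-existing use_differentiable_head and head_params loading options:

from setfit import SetFitModel

# Load a body plus a torch classification head instead of the default
# scikit-learn LogisticRegression head.
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-mpnet-base-v2",
    use_differentiable_head=True,
    head_params={"out_features": 2},
)

sentences = ["an uplifting little film", "a tedious, joyless slog"]

embeddings = model.encode(sentences)          # new shared helper from this diff
labels = model.predict(sentences)             # argmax over the head's logits
probabilities = model.predict_proba(sentences)
print(labels, probabilities.shape)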