
Merge pull request #153 from KennethEnevoldsen/ensure-consistent-names
Ensure consistent names
KennethEnevoldsen authored Feb 19, 2024
2 parents e5a8964 + 157a91c commit 83fd962
Showing 22 changed files with 230 additions and 162 deletions.
38 changes: 29 additions & 9 deletions docs/getting_started.ipynb
@@ -152,7 +152,7 @@
],
"source": [
"%%bash\n",
"seb run -m sentence-transformers/all-MiniLM-L6-v2 --output-path model_results/"
"seb run -m all-MiniLM-L6-v2 --output-path model_results/"
]
},
{
@@ -256,7 +256,7 @@
}
],
"source": [
"models = [seb.get_model(\"sentence-transformers/all-MiniLM-L6-v2\")]\n",
"models = [seb.get_model(\"all-MiniLM-L6-v2\")]\n",
"# for simplicity, we will only run it with one model, but you could run it with multiple models:\n",
"# models = seb.get_all_models()\n",
"\n",
@@ -309,18 +309,38 @@
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"\n",
"from typing import Any\n",
"import seb\n",
"import numpy as np\n",
"\n",
"\n",
"model_name = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
"\n",
"\n",
"model_name = \"sentence-transformers/all-MiniLM-L12-v2\"\n",
"class MyEncoder(seb.Encoder):\n",
" \"\"\"\n",
" A custom model for SEB that uses the SentenceTransformer library.\n",
" \"\"\"\n",
"\n",
" def __init__(self):\n",
" self.model = SentenceTransformer(model_name)\n",
"\n",
"def get_my_model() -> SentenceTransformer:\n",
" return SentenceTransformer(model_name)\n",
" def encode( # type: ignore\n",
" self,\n",
" sentences: list[str],\n",
" *,\n",
" task: seb.Task,\n",
" **kwargs: Any,\n",
" ) -> np.ndarray:\n",
" if task.name == \"DKHate\": # allow you to embed differently based on the task\n",
" emb = self.model.encode(sentences, batch_size=32, **kwargs)\n",
" else:\n",
" emb = self.model.encode(sentences, batch_size=32, **kwargs) # here we just do the same for all tasks\n",
" return emb\n",
"\n",
"\n",
"@seb.models.register(model_name) # add the model to the registry\n",
"def create_all_mini_lm_l6_v2() -> seb.EmbeddingModel:\n",
"def create_my_model() -> seb.SebModel:\n",
" hf_name = model_name\n",
"\n",
" # create meta data\n",
@@ -331,8 +351,8 @@
" languages=[],\n",
" embedding_size=384,\n",
" )\n",
" return seb.EmbeddingModel(\n",
" loader=get_my_model,\n",
" return seb.SebModel(\n",
" encoder=MyEncoder(),\n",
" meta=meta,\n",
" )"
]
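Taken together, the updated notebook cell registers `MyEncoder` under the full Hugging Face name. A minimal sketch of resolving the registered model afterwards — only `seb.get_model` and the metadata fields appear in this diff, so the `model.meta` attribute access is an assumption:

import seb

# Resolve the custom model; the lookup key is whatever was passed to
# @seb.models.register — here the full HF name from the cell above.
model = seb.get_model("sentence-transformers/all-MiniLM-L12-v2")
print(model.meta.embedding_size)  # 384, per the ModelMeta in the notebook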
1 change: 1 addition & 0 deletions makefile
@@ -26,6 +26,7 @@ pr:
	make lint
	make static-type-check
	make test
+	python src/scripts/check_benchmark_is_up_to_date.py
	@echo "Ready to make a PR"

update-table-in-docs:
1 change: 1 addition & 0 deletions src/seb/cache/all-MiniLM-L6-v2/DKHate.json
@@ -0,0 +1 @@
{"task_name":"DKHate","task_description":"Danish Tweets annotated for Hate Speech either being Offensive or not","task_version":"1.1.1","time_of_run":"2024-02-17T14:15:56.990723","scores":{"da":{"accuracy":0.5504559270516718,"f1":0.4487544754943351,"ap":0.1339388090920717,"accuracy_stderr":0.08179003177509295,"f1_stderr":0.04439449341359171,"ap_stderr":0.011425193008238908,"main_score":0.5504559270516718}},"main_score":"accuracy"}
1 change: 1 addition & 0 deletions src/seb/cache/all-MiniLM-L6-v2/LCC.json
@@ -0,0 +1 @@
{"task_name":"LCC","task_description":"The leipzig corpora collection, annotated for sentiment","task_version":"1.1.1","time_of_run":"2024-02-17T14:27:16.594009","scores":{"da":{"accuracy":0.3846666666666666,"f1":0.3650136884557438,"accuracy_stderr":0.03664241622309678,"f1_stderr":0.03540233062350939,"main_score":0.3846666666666666}},"main_score":"accuracy"}
1 change: 1 addition & 0 deletions src/seb/cache/all-MiniLM-L6-v2/ScaLA.json
@@ -0,0 +1 @@
{"task_name":"ScaLA","task_description":"A linguistic acceptability task for Danish, Norwegian Bokmål Norwegian Nynorsk and Swedish.","task_version":"1.1.1","time_of_run":"2024-02-17T14:16:48.189570","scores":{"da":{"accuracy":0.500341796875,"f1":0.49707836995043886,"ap":0.5002094933987072,"accuracy_stderr":0.00619193763738963,"f1_stderr":0.0056844458165261,"ap_stderr":0.003086975974967957,"main_score":0.500341796875},"nb":{"accuracy":0.501708984375,"f1":0.4975764910012903,"ap":0.5008866627584277,"accuracy_stderr":0.005376639537486631,"f1_stderr":0.0057738834379346465,"ap_stderr":0.0027086373278729326,"main_score":0.501708984375},"sv":{"accuracy":0.4994140625,"f1":0.49683291347295633,"ap":0.49972805126929065,"accuracy_stderr":0.004620905532937882,"f1_stderr":0.004218877367923389,"ap_stderr":0.002309284713600242,"main_score":0.4994140625},"nn":{"accuracy":0.500341796875,"f1":0.4961039513494653,"ap":0.5002008066815936,"accuracy_stderr":0.005393463714180493,"f1_stderr":0.007401911092391309,"ap_stderr":0.002724109383421351,"main_score":0.500341796875}},"main_score":"accuracy"}
@@ -1 +1 @@
{"task_name":"LCC","task_description":"The leipzig corpora collection, annotated for sentiment","task_version":"1.1.1","time_of_run":"2024-02-17T13:29:21.544827","scores":{"da":{"accuracy":0.3846666666666666,"f1":0.3650136884557438,"accuracy_stderr":0.03664241622309678,"f1_stderr":0.03540233062350939,"main_score":0.3846666666666666}},"main_score":"accuracy"}
{"task_name":"LCC","task_description":"The leipzig corpora collection, annotated for sentiment","task_version":"1.1.1","time_of_run":"2024-02-17T13:50:54.460803","scores":{"da":{"accuracy":0.3846666666666666,"f1":0.3650136884557438,"accuracy_stderr":0.03664241622309678,"f1_stderr":0.03540233062350939,"main_score":0.3846666666666666}},"main_score":"accuracy"}
6 changes: 2 additions & 4 deletions src/seb/cli/run.py
@@ -5,10 +5,8 @@

from radicli import Arg, get_list_converter

-from seb.registered_models.hf_models import get_sentence_transformer
-
import seb
-from seb.registered_models.hf_models import SentenceTransformerWithTaskEncode
+from seb.registered_models.sentence_transformer_models import SentenceTransformerWithTaskEncode, wrap_sentence_transformer
from seb.registries import get_all_models

from .cli import cli
@@ -32,7 +30,7 @@ def build_model(model_name: str) -> seb.SebModel:
    )
    model = seb.SebModel(
        meta=meta,
-        encoder=seb.LazyLoadEncoder(partial(get_sentence_transformer, model_name=model_name)),  # type: ignore
+        encoder=seb.LazyLoadEncoder(partial(wrap_sentence_transformer, model_name=model_name)),  # type: ignore
    )
    return model

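For context, `build_model` wraps a sentence-transformer name in a `SebModel`; a sketch of exercising it directly (assuming `src/seb/cli/run.py` is importable as `seb.cli.run`, which is not shown in this diff):

from seb.cli.run import build_model

# The encoder is a LazyLoadEncoder, so the underlying SentenceTransformer
# is not instantiated (or downloaded) until the first encode call.
model = build_model("all-MiniLM-L6-v2")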
46 changes: 37 additions & 9 deletions src/seb/interfaces/model.py
@@ -1,10 +1,12 @@
import json
+import logging
from dataclasses import dataclass
from datetime import date
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Optional, Protocol, runtime_checkable

-from numpy.typing import ArrayLike
+import numpy as np
+import torch
from pydantic import BaseModel

from seb.interfaces.language import Language
@@ -13,6 +15,9 @@
from .task import Task


+logger = logging.getLogger(__name__)


@runtime_checkable
class Encoder(Protocol):
    """
@@ -26,7 +31,7 @@ def encode(
        task: Optional["Task"] = None,
        batch_size: int = 32,
        **kwargs: Any,
-    ) -> ArrayLike:
+    ) -> np.ndarray:
        """Returns a list of embeddings for the given sentences.
        Args:
@@ -41,6 +46,16 @@
"""
...

# The following methods are optional and can be implemented if the model supports them.
# def to(self, device: torch.device):
# ...

# def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
# ...

# def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> np.ndarray:
# ...


class ModelMeta(BaseModel):
    """
@@ -54,7 +69,7 @@ class ModelMeta(BaseModel):
    languages: list[Language] = []
    open_source: bool = False
    embedding_size: Optional[int] = None
-    model_type: Optional[str] = None
+    model_architecture: Optional[str] = None
    release_date: Optional[date] = None

    def get_path_name(self) -> str:
@@ -89,22 +104,35 @@ class LazyLoadEncoder(Encoder):
    loader: Callable[[], Encoder]
    _model: Optional[Encoder] = None

+    def load_model(self):
+        """
+        Load the model.
+        """
+        if self._model is None:
+            self._model = self.loader()

+    def to(self, device: torch.device):
+        self.load_model()
+        try:
+            self._model = self._model.to(device)  # type: ignore
+        except AttributeError:
+            logging.debug(f"Model {self._model} does not have a to method")

    @property
    def model(self) -> Encoder:
        """
        Dynamically load the model.
        """
-        if self._model is None:
-            self._model = self.loader()
-        return self._model
+        self.load_model()
+        return self._model  # type: ignore

    def encode(
        self,
        sentences: list[str],
        *,
        task: Optional["Task"] = None,
        **kwargs: Any,
-    ) -> ArrayLike:
+    ) -> np.ndarray:
        """
        Returns a list of embeddings for the given sentences.
        Args:
@@ -119,13 +147,13 @@ def encode(
        """
        return self.model.encode(sentences, task=task, **kwargs)

-    def encode_queries(self, queries: list[str], **kwargs: Any) -> ArrayLike:
+    def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
        try:
            return self.model.encode_queries(queries, **kwargs)  # type: ignore
        except AttributeError:
            return self.encode(queries, task=None, **kwargs)

-    def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> ArrayLike:
+    def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> np.ndarray:
        try:
            return self.model.encode_corpus(corpus, **kwargs)  # type: ignore
        except AttributeError:
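Since `Encoder` is a runtime-checkable protocol whose `to`/`encode_queries`/`encode_corpus` hooks are optional, and `LazyLoadEncoder` now forwards `to` and falls back to `encode` when a hook is missing, here is a toy sketch of that contract (the dummy encoder is illustrative, not part of the repo):

import numpy as np
import seb

class DummyEncoder:
    """Minimal Encoder-protocol implementation for illustration."""

    def encode(self, sentences, *, task=None, batch_size=32, **kwargs) -> np.ndarray:
        # A real encoder would run a model; we return fixed-size zeros.
        return np.zeros((len(sentences), 8))

lazy = seb.LazyLoadEncoder(DummyEncoder)  # loader: zero-arg callable returning an Encoder
assert isinstance(lazy, seb.Encoder)  # runtime_checkable protocol
# encode_queries falls back to encode() because DummyEncoder does not define it.
print(lazy.encode_queries(["hej", "verden"]).shape)  # (2, 8)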
4 changes: 2 additions & 2 deletions src/seb/registered_models/__init__.py
@@ -2,7 +2,7 @@
from .e5_mistral import *
from .e5_models import *
from .fairseq_models import *
-from .fasttext import *
-from .hf_models import *
+from .fasttext_models import *
+from .sentence_transformer_models import *
from .openai_models import *
from .translate_e5_models import *
16 changes: 8 additions & 8 deletions src/seb/registered_models/cohere_models.py
@@ -8,6 +8,7 @@
from functools import partial
from typing import Any, Optional

+import numpy as np
import torch

from seb.interfaces.model import Encoder, LazyLoadEncoder, ModelMeta, SebModel
@@ -40,31 +41,30 @@ def _embed(self, sentences: list[str], input_type: str) -> torch.Tensor:
    def encode(
        self,
        sentences: list[str],
-        batch_size: int = 32,  # noqa: ARG002
        *,
        task: Optional[Task] = None,
        **kwargs: Any,  # noqa: ARG002
-    ) -> torch.Tensor:
+    ) -> np.ndarray:
        if task and task.task_type == "Classification":
            input_type = "classification"
        elif task and task.task_type == "Clustering":
            input_type = "clustering"
        else:
            input_type = "search_document"
-        return self._embed(sentences, input_type=input_type)
+        return self._embed(sentences, input_type=input_type).numpy()

-    def encode_queries(self, queries: list[str], batch_size: int, **kwargs):  # noqa
-        return self._embed(queries, input_type="search_query")
+    def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:  # noqa: ARG002
+        return self._embed(queries, input_type="search_query").numpy()

-    def encode_corpus(self, corpus: list[dict[str, str]], batch_size: int, **kwargs):  # noqa
+    def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> np.ndarray:  # noqa: ARG002
        if isinstance(corpus, dict):
            sentences = [
                (corpus["title"][i] + self.sep + corpus["text"][i]).strip() if "title" in corpus else corpus["text"][i].strip()  # type: ignore
                for i in range(len(corpus["text"]))  # type: ignore
            ]
        else:
            sentences = [(doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() for doc in corpus]
-        return self._embed(sentences, input_type="search_document")
+        return self._embed(sentences, input_type="search_document").numpy()


@models.register("embed-multilingual-v3.0")
@@ -77,7 +77,7 @@ def create_embed_multilingual_v3() -> SebModel:
        languages=[],
        open_source=False,
        embedding_size=1024,
-        model_type="API",
+        model_architecture="API",
        release_date=date(2023, 11, 2),
    )
    return SebModel(
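The corpus handling above accepts either a dict of parallel lists or a list of per-document dicts; the same normalization, pulled out as a standalone sketch (`sep` and field names as in the hunk):

def flatten_corpus(corpus, sep: str = " ") -> list[str]:
    """Join optional titles onto texts, one string per document."""
    if isinstance(corpus, dict):  # {"title": [...], "text": [...]}
        return [
            (corpus["title"][i] + sep + corpus["text"][i]).strip() if "title" in corpus else corpus["text"][i].strip()
            for i in range(len(corpus["text"]))
        ]
    # [{"title": ..., "text": ...}, ...]
    return [(doc["title"] + sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() for doc in corpus]

print(flatten_corpus([{"title": "Hej", "text": "verden"}]))  # ['Hej verden']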
20 changes: 8 additions & 12 deletions src/seb/registered_models/e5_mistral.py
@@ -4,20 +4,16 @@
from itertools import islice
from typing import Any, Literal, Optional, TypeVar

+import numpy as np
import torch
import torch.nn.functional as F
-from numpy.typing import ArrayLike
from torch import Tensor
+from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, BatchEncoding

from seb.interfaces.model import Encoder, LazyLoadEncoder, ModelMeta, SebModel
from seb.interfaces.task import Task
from seb.registries import models

-from tqdm import tqdm

-import logging

logger = logging.getLogger(__name__)


@@ -146,7 +142,7 @@ def encode(
        batch_size: int = 32,
        encode_type: EncodeTypes = "query",
        **kwargs: Any,  # noqa
-    ) -> ArrayLike:
+    ) -> np.ndarray:
        if batch_size > self.max_batch_size:
            batch_size = self.max_batch_size
        batched_embeddings = []
@@ -164,9 +160,9 @@
            )
            batched_embeddings.append(embeddings.detach().cpu())

-        return torch.cat(batched_embeddings).to("cpu")
+        return torch.cat(batched_embeddings).to("cpu").detach().numpy()

-    def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> ArrayLike:
+    def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> np.ndarray:
        sep = " "
        if isinstance(corpus, dict):
            sentences = [
@@ -177,11 +173,11 @@ def encode_corpus(self, corpus: list[dict[str, str]], **kwargs: Any) -> ArrayLike:
            sentences = [(doc["title"] + sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() for doc in corpus]
        return self.encode(sentences, encode_type="passage", **kwargs)

-    def encode_queries(self, queries: list[str], **kwargs: Any) -> ArrayLike:
+    def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
        return self.encode(queries, encode_type="query", **kwargs)


@models.register("intfloat/e5-mistral-7b-instruct")
@models.register("e5-mistral-7b-instruct")
def create_multilingual_e5_mistral_7b_instruct() -> SebModel:
hf_name = "intfloat/e5-mistral-7b-instruct"
meta = ModelMeta(
@@ -191,7 +187,7 @@ def create_multilingual_e5_mistral_7b_instruct() -> SebModel:
        languages=[],
        open_source=True,
        embedding_size=4096,
-        model_type="Mistral",
+        model_architecture="Mistral",
        release_date=date(2023, 12, 20),
    )
    return SebModel(
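In line with the PR's naming cleanup, the registry key drops the `intfloat/` prefix while `hf_name` keeps the full Hugging Face id; a sketch of the resulting lookup (assuming `seb.get_model` resolves registry keys as in the notebook above):

import seb

model = seb.get_model("e5-mistral-7b-instruct")  # short registry key after this change
print(model.meta.embedding_size)  # 4096, per the ModelMeta above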