
Commit

Update semanlink dataset and provide sentence similarity models wrapper.
raphaelsty committed Jan 25, 2022
1 parent 23fe00b commit 817423c
Showing 19 changed files with 106,662 additions and 11,332 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -10,4 +10,5 @@ __pycache__/
 *.pickle
 *.icloud
 models/
-api/
+api/
+*__pycache__
4 changes: 2 additions & 2 deletions ckb/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (0, 0, 2)
+VERSION = (0, 0, 3)
 
-__version__ = '.'.join(map(str, VERSION))
+__version__ = ".".join(map(str, VERSION))
53 changes: 37 additions & 16 deletions ckb/datasets/semanlink.py
@@ -1,23 +1,25 @@
 import os
+import json
 import pathlib
 
 import pandas as pd
 
 from mkb import datasets as mkb_datasets
 
 from ..utils import read_csv
 
 
 __all__ = ["Semanlink"]
 
 
 class Semanlink(mkb_datasets.Dataset):
     """Semanlink dataset.
     Train triplets gather entities created before 2019-06-01.
     Valid triplets gather entities created between 2019-06-01 and 2020-06-01.
     Test triplets gather entities created between 2020-06-01 and 2021-10-27.
     Parameters
     ----------
         batch_size (int): Size of the batch.
+        use_labels (bool): Replaces the identifier of the entities with their textual label.
         shuffle (bool): Whether to shuffle the dataset or not.
         pre_compute (bool): Pre-compute parameters such as weights when using translational models
             (TransE, DistMult, RotatE, pRotatE, ComplEx).
         num_workers (int): Number of workers dedicated to iterate on the dataset.
@@ -38,32 +40,51 @@ class Semanlink(mkb_datasets.Dataset):
     >>> from ckb import datasets
 
-    >>> dataset = datasets.Semanlink(batch_size=1, pre_compute=True, shuffle=True, seed=42)
+    >>> dataset = datasets.Semanlink(batch_size=1, pre_compute=False, shuffle=True, seed=42)
 
     >>> dataset
     Semanlink dataset
         Batch size 1
-        Entities 5454
-        Relations 4
-        Shuffle True
-        Train triples 6422
-        Validation triples 803
-        Test triples 803
+        Entities 32502
+        Relations 40
+        Shuffle True
+        Train triples 73828
+        Validation triples 5035
+        Test triples 6094
     """
 
     def __init__(
-        self, batch_size, shuffle=True, pre_compute=True, num_workers=1, seed=None
+        self,
+        batch_size,
+        use_labels=True,
+        shuffle=True,
+        pre_compute=True,
+        num_workers=1,
+        seed=None,
     ):
 
         self.filename = "semanlink"
 
         path = pathlib.Path(__file__).parent.joinpath(self.filename)
 
+        if use_labels:
+            with open(f"{path}/labels.json", "r") as entities_labels:
+                labels = json.load(entities_labels)
+
+        train = read_csv(path=f"{path}/train.csv", sep="|")
+        valid = read_csv(path=f"{path}/valid.csv", sep="|")
+        test = read_csv(path=f"{path}/test.csv", sep="|")
+
+        if use_labels:
+            train = [(labels.get(h, h), r, labels.get(t, t)) for h, r, t in train]
+            valid = [(labels.get(h, h), r, labels.get(t, t)) for h, r, t in valid]
+            test = [(labels.get(h, h), r, labels.get(t, t)) for h, r, t in test]
+
         super().__init__(
-            train=read_csv(path=f"{path}/train.csv", sep="|"),
-            valid=read_csv(path=f"{path}/valid.csv", sep="|"),
-            test=read_csv(path=f"{path}/test.csv", sep="|"),
+            train=train,
+            valid=valid,
+            test=test,
             classification=False,
             pre_compute=pre_compute,
             batch_size=batch_size,
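The new use_labels path simply swaps raw entity identifiers for their textual labels, falling back to the identifier when no label exists. A minimal sketch of that substitution, with an invented identifier and label (the real labels.json maps Semanlink entity identifiers to their labels):

# Stand-in for json.load on labels.json; identifier and label invented for illustration.
labels = {"tag:markov_chain": "Markov chain"}
train = [("tag:markov_chain", "broader", "tag:stochastic_process")]

# Fall back to the raw identifier when no label is known, exactly as the dataset does.
train = [(labels.get(h, h), r, labels.get(t, t)) for h, r, t in train]
print(train)  # [('Markov chain', 'broader', 'tag:stochastic_process')]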
11,905 changes: 11,905 additions & 0 deletions ckb/datasets/semanlink/labels.json (large diff not rendered)

3,158 changes: 0 additions & 3,158 deletions ckb/datasets/semanlink/questions.csv (file deleted)

9,434 changes: 8,631 additions & 803 deletions ckb/datasets/semanlink/test.csv (large diff not rendered)

85,169 changes: 78,747 additions & 6,422 deletions ckb/datasets/semanlink/train.csv (large diff not rendered)

7,915 changes: 7,112 additions & 803 deletions ckb/datasets/semanlink/valid.csv (large diff not rendered)
2 changes: 2 additions & 0 deletions ckb/models/__init__.py
@@ -1,11 +1,13 @@
 from .base import BaseModel
 from .distill_bert import DistillBert
 from .flaubert import FlauBERT
+from .similarity import Similarity
 from .transformer import Transformer
 
 __all__ = [
     "BaseModel",
     "DistillBert",
     "FlauBERT",
+    "Similarity",
     "Transformer",
 ]
162 changes: 162 additions & 0 deletions ckb/models/similarity.py
@@ -0,0 +1,162 @@
__all__ = ["Similarity"]

import torch

from ..scoring import TransE
from .base import BaseModel


class Similarity(BaseModel):
    """Sentence Similarity models wrapper.

    Parameters
    ----------
        gamma (int): A higher gamma parameter increases the upper and lower bounds of the latent
            space and vice-versa.
        entities (dict): Mapping between entities id and entities label.
        relations (dict): Mapping between relations id and relations label.

    Examples
    --------
    >>> from ckb import models
    >>> from ckb import datasets

    >>> from transformers import AutoTokenizer, AutoModel
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
    >>> model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

    >>> _ = torch.manual_seed(42)

    >>> dataset = datasets.Semanlink(1, pre_compute=False)

    >>> model = models.Similarity(
    ...     model = model,
    ...     tokenizer = tokenizer,
    ...     entities = dataset.entities,
    ...     relations = dataset.relations,
    ...     gamma = 9,
    ...     device = 'cpu',
    ... )

    >>> sample = torch.tensor([[0, 0, 0], [2, 2, 2]])
    >>> model(sample)
    tensor([[3.5273],
            [3.6367]], grad_fn=<ViewBackward>)

    >>> sample = torch.tensor([[0, 0, 1], [2, 2, 1]])
    >>> model(sample)
    tensor([[-78.3936],
            [-79.7217]], grad_fn=<ViewBackward>)

    >>> sample = torch.tensor([[1, 0, 0], [1, 2, 2]])
    >>> model(sample)
    tensor([[-78.1690],
            [-80.2369]], grad_fn=<ViewBackward>)

    >>> sample = torch.tensor([[0, 0, 0], [2, 2, 2]])
    >>> negative_sample = torch.tensor([[0], [2]])

    >>> model(sample, negative_sample, mode='head-batch')
    tensor([[3.5273],
            [3.6367]], grad_fn=<ViewBackward>)

    >>> model(sample, negative_sample, mode='tail-batch')
    tensor([[3.5273],
            [3.6367]], grad_fn=<ViewBackward>)

    References
    ----------
    1. [Sentence Similarity models](https://huggingface.co/models?pipeline_tag=sentence-similarity&sort=downloads)

    """

    def __init__(
        self,
        model,
        tokenizer,
        entities,
        relations,
        scoring=TransE(),
        hidden_dim=None,
        gamma=9,
        device="cuda",
    ):

        # Without an explicit hidden dimension, keep the encoder's native 768
        # dimensions; otherwise project embeddings with a linear layer below.
        if hidden_dim is None:
            hidden_dim = 768
            init_l2 = False
        else:
            init_l2 = True

        super(Similarity, self).__init__(
            hidden_dim=hidden_dim,
            entities=entities,
            relations=relations,
            scoring=scoring,
            gamma=gamma,
        )

        self.tokenizer = tokenizer
        self.model = model
        self.max_length = list(self.tokenizer.max_model_input_sizes.values())[0]
        self.device = device

        if init_l2:
            self.l2 = torch.nn.Linear(768, hidden_dim)
        else:
            self.l2 = None

    def encoder(self, e):
        """Encode input entities descriptions.

        Parameters:
            e (list): List of descriptions of entities.

        Returns:
            Torch tensor of encoded entities.
        """
        inputs = self.tokenizer.batch_encode_plus(
            e,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_token_type_ids=True,
            return_tensors="pt",
        )

        # batch_encode_plus already returns tensors with return_tensors="pt",
        # so the inputs only need to be moved to the right device.
        output = self.model(
            input_ids=inputs["input_ids"].to(self.device),
            attention_mask=inputs["attention_mask"].to(self.device),
        )

        sentence_embeddings = self.mean_pooling(
            output=output, attention_mask=inputs["attention_mask"].to(self.device)
        )

        if self.l2 is not None:
            sentence_embeddings = self.l2(sentence_embeddings)

        return sentence_embeddings

    @staticmethod
    def mean_pooling(output, attention_mask):
        """Mean pooling.

        References
        ----------
        1. [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)

        """
        # The last hidden state contains the embeddings of every token.
        token_embeddings = output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )
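To see what mean_pooling computes, here is a small self-contained check on dummy tensors (shapes and values invented for illustration): masked positions contribute nothing, so each sentence embedding is the average of its real token embeddings only.

import torch

token_embeddings = torch.ones(2, 4, 3)  # 2 sentences, 4 token positions, hidden size 3
attention_mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]])  # second sentence is padded

mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
pooled = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
print(pooled)  # all ones: padding positions are excluded from the average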
2 changes: 1 addition & 1 deletion ckb/models/transformer.py
@@ -117,7 +117,7 @@ def encoder(self, e):
         add_special_tokens=True,
         truncation=True,
         max_length=self.max_length,
-        padding="max_length",
+        padding="longest",
         return_token_type_ids=True,
     )
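This switch pads each batch only to its longest sequence instead of always padding to the model maximum, which avoids running the encoder over hundreds of padding tokens for short batches. A quick sketch of the difference, assuming an illustrative checkpoint and max length (neither is fixed by this commit):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
batch = ["a short text", "a slightly longer text about knowledge bases"]

longest = tokenizer(batch, padding="longest", truncation=True, max_length=512)
fixed = tokenizer(batch, padding="max_length", truncation=True, max_length=512)

print(len(longest["input_ids"][0]))  # padded only to the longest sentence in the batch
print(len(fixed["input_ids"][0]))    # always 512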
2 changes: 2 additions & 0 deletions docs/api/datasets/Semanlink.md
@@ -8,6 +8,8 @@ Semanlink dataset.
 
 - **batch_size**
 
+- **use_labels** – defaults to `True`
+
 - **shuffle** – defaults to `True`
 
 - **pre_compute** – defaults to `True`
14 changes: 1 addition & 13 deletions docs/api/evaluation/Evaluation.md
@@ -16,11 +16,7 @@ Wrapper for MKB evaluation module.
 
 - **device** – defaults to `cuda`
 
-- **num_workers** – defaults to `1`
-
-- **entities_to_drop** – defaults to `[]`
-
-- **same_entities** – defaults to `{}`
+- **num_workers** – defaults to `0`
 
 
@@ -149,14 +145,6 @@ DistillBert model
 
 - **model**
 
-???- note "solve_same_entities"
-
-    Replace artificial entities by the target. Some description may be dedicated to the same entities.
-
-    **Parameters**
-
-    - **argsort**
-
 ???- note "types_relations"
 
     Divide input dataset relations into different categories (i.e. ONE-TO-ONE, ONE-TO-MANY, MANY-TO-ONE and MANY-TO-MANY) according to the mapping properties of relationships.