
Commit

Update semanlink dataset and provide sentence similarity models wrapper.
raphaelsty committed Jan 25, 2022
1 parent 23fe00b commit 817423c
Showing 19 changed files with 106,662 additions and 11,332 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -10,4 +10,5 @@ __pycache__/
 *.pickle
 *.icloud
 models/
-api/
+api/
+*__pycache__
4 changes: 2 additions & 2 deletions ckb/__version__.py
@@ -1,3 +1,3 @@
-VERSION = (0, 0, 2)
+VERSION = (0, 0, 3)
 
-__version__ = '.'.join(map(str, VERSION))
+__version__ = ".".join(map(str, VERSION))
53 changes: 37 additions & 16 deletions ckb/datasets/semanlink.py
@@ -1,23 +1,25 @@
 import os
+import json
 import pathlib
 
 import pandas as pd
 
 from mkb import datasets as mkb_datasets
 
 from ..utils import read_csv
 
 
 __all__ = ["Semanlink"]
 
 
 class Semanlink(mkb_datasets.Dataset):
     """Semanlink dataset.
     Train triplets gather entities created before 2019-06-01.
     Valid triplets gather entities created between 2019-06-01 and 2020-06-01.
     Test triplets gather entities created between 2020-06-01 and 2021-10-27.
     Parameters
     ----------
         batch_size (int): Size of the batch.
+        use_labels (bool): Replaces the identifier of the entities with their textual label.
         shuffle (bool): Whether to shuffle the dataset or not.
         pre_compute (bool): Pre-compute parameters such as weights when using translational models
             (TransE, DistMult, RotatE, pRotatE, ComplEx).
         num_workers (int): Number of workers dedicated to iterate on the dataset.
@@ -38,32 +40,51 @@ class Semanlink(mkb_datasets.Dataset):
     >>> from ckb import datasets
 
-    >>> dataset = datasets.Semanlink(batch_size=1, pre_compute=True, shuffle=True, seed=42)
+    >>> dataset = datasets.Semanlink(batch_size=1, pre_compute=False, shuffle=True, seed=42)
 
     >>> dataset
     Semanlink dataset
         Batch size 1
-        Entities 5454
-        Relations 4
-        Shuffle True
-        Train triples 6422
-        Validation triples 803
-        Test triples 803
+        Entities 32502
+        Relations 40
+        Shuffle True
+        Train triples 73828
+        Validation triples 5035
+        Test triples 6094
     """
 
     def __init__(
-        self, batch_size, shuffle=True, pre_compute=True, num_workers=1, seed=None
+        self,
+        batch_size,
+        use_labels=True,
+        shuffle=True,
+        pre_compute=True,
+        num_workers=1,
+        seed=None,
     ):
 
         self.filename = "semanlink"
 
         path = pathlib.Path(__file__).parent.joinpath(self.filename)
 
+        if use_labels:
+            with open(f"{path}/labels.json", "r") as entities_labels:
+                labels = json.load(entities_labels)
+
+        train = read_csv(path=f"{path}/train.csv", sep="|")
+        valid = read_csv(path=f"{path}/valid.csv", sep="|")
+        test = read_csv(path=f"{path}/test.csv", sep="|")
+
+        if use_labels:
+            train = [(labels.get(h, h), r, labels.get(t, t)) for h, r, t in train]
+            valid = [(labels.get(h, h), r, labels.get(t, t)) for h, r, t in valid]
+            test = [(labels.get(h, h), r, labels.get(t, t)) for h, r, t in test]
+
         super().__init__(
-            train=read_csv(path=f"{path}/train.csv", sep="|"),
-            valid=read_csv(path=f"{path}/valid.csv", sep="|"),
-            test=read_csv(path=f"{path}/test.csv", sep="|"),
+            train=train,
+            valid=valid,
+            test=test,
             classification=False,
             pre_compute=pre_compute,
             batch_size=batch_size,
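The new use_labels path simply swaps raw entity identifiers for their textual labels, falling back to the identifier when no label exists. A minimal sketch of that substitution, with an invented identifier and label (the real labels.json maps Semanlink entity identifiers to their labels):

# Stand-in for json.load on labels.json; identifier and label invented for illustration.
labels = {"tag:markov_chain": "Markov chain"}
train = [("tag:markov_chain", "broader", "tag:stochastic_process")]

# Fall back to the raw identifier when no label is known, exactly as the dataset does.
train = [(labels.get(h, h), r, labels.get(t, t)) for h, r, t in train]
print(train)  # [('Markov chain', 'broader', 'tag:stochastic_process')]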
11,905 changes: 11,905 additions & 0 deletions ckb/datasets/semanlink/labels.json (large diff not rendered)

3,158 changes: 0 additions & 3,158 deletions ckb/datasets/semanlink/questions.csv (file deleted)

9,434 changes: 8,631 additions & 803 deletions ckb/datasets/semanlink/test.csv (large diff not rendered)

85,169 changes: 78,747 additions & 6,422 deletions ckb/datasets/semanlink/train.csv (large diff not rendered)

7,915 changes: 7,112 additions & 803 deletions ckb/datasets/semanlink/valid.csv (large diff not rendered)
2 changes: 2 additions & 0 deletions ckb/models/__init__.py
@@ -1,11 +1,13 @@
 from .base import BaseModel
 from .distill_bert import DistillBert
 from .flaubert import FlauBERT
+from .similarity import Similarity
 from .transformer import Transformer
 
 __all__ = [
     "BaseModel",
     "DistillBert",
     "FlauBERT",
+    "Similarity",
     "Transformer",
 ]
162 changes: 162 additions & 0 deletions ckb/models/similarity.py
@@ -0,0 +1,162 @@
__all__ = ["Similarity"]

import torch

from ..scoring import TransE
from .base import BaseModel


class Similarity(BaseModel):
    """Sentence Similarity models wrapper.

    Parameters
    ----------
        gamma (int): A higher gamma parameter increases the upper and lower bounds of the latent
            space and vice-versa.
        entities (dict): Mapping between entities id and entities label.
        relations (dict): Mapping between relations id and relations label.

    Examples
    --------
    >>> from ckb import models
    >>> from ckb import datasets

    >>> from transformers import AutoTokenizer, AutoModel
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
    >>> model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

    >>> _ = torch.manual_seed(42)

    >>> dataset = datasets.Semanlink(1, pre_compute=False)

    >>> model = models.Similarity(
    ...     model = model,
    ...     tokenizer = tokenizer,
    ...     entities = dataset.entities,
    ...     relations = dataset.relations,
    ...     gamma = 9,
    ...     device = 'cpu',
    ... )

    >>> sample = torch.tensor([[0, 0, 0], [2, 2, 2]])
    >>> model(sample)
    tensor([[3.5273],
            [3.6367]], grad_fn=<ViewBackward>)

    >>> sample = torch.tensor([[0, 0, 1], [2, 2, 1]])
    >>> model(sample)
    tensor([[-78.3936],
            [-79.7217]], grad_fn=<ViewBackward>)

    >>> sample = torch.tensor([[1, 0, 0], [1, 2, 2]])
    >>> model(sample)
    tensor([[-78.1690],
            [-80.2369]], grad_fn=<ViewBackward>)

    >>> sample = torch.tensor([[0, 0, 0], [2, 2, 2]])
    >>> negative_sample = torch.tensor([[0], [2]])

    >>> model(sample, negative_sample, mode='head-batch')
    tensor([[3.5273],
            [3.6367]], grad_fn=<ViewBackward>)

    >>> model(sample, negative_sample, mode='tail-batch')
    tensor([[3.5273],
            [3.6367]], grad_fn=<ViewBackward>)

    References
    ----------
    1. [Sentence Similarity models](https://huggingface.co/models?pipeline_tag=sentence-similarity&sort=downloads)

    """

    def __init__(
        self,
        model,
        tokenizer,
        entities,
        relations,
        scoring=TransE(),
        hidden_dim=None,
        gamma=9,
        device="cuda",
    ):

        # Without an explicit hidden dimension, keep the encoder's native 768
        # dimensions; otherwise project embeddings with a linear layer below.
        if hidden_dim is None:
            hidden_dim = 768
            init_l2 = False
        else:
            init_l2 = True

        super(Similarity, self).__init__(
            hidden_dim=hidden_dim,
            entities=entities,
            relations=relations,
            scoring=scoring,
            gamma=gamma,
        )

        self.tokenizer = tokenizer
        self.model = model
        self.max_length = list(self.tokenizer.max_model_input_sizes.values())[0]
        self.device = device

        if init_l2:
            self.l2 = torch.nn.Linear(768, hidden_dim)
        else:
            self.l2 = None

    def encoder(self, e):
        """Encode input entities descriptions.

        Parameters:
            e (list): List of descriptions of entities.

        Returns:
            Torch tensor of encoded entities.
        """
        inputs = self.tokenizer.batch_encode_plus(
            e,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_token_type_ids=True,
            return_tensors="pt",
        )

        # batch_encode_plus already returns tensors with return_tensors="pt",
        # so the inputs only need to be moved to the right device.
        output = self.model(
            input_ids=inputs["input_ids"].to(self.device),
            attention_mask=inputs["attention_mask"].to(self.device),
        )

        sentence_embeddings = self.mean_pooling(
            output=output, attention_mask=inputs["attention_mask"].to(self.device)
        )

        if self.l2 is not None:
            sentence_embeddings = self.l2(sentence_embeddings)

        return sentence_embeddings

    @staticmethod
    def mean_pooling(output, attention_mask):
        """Mean pooling.

        References
        ----------
        1. [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)

        """
        # The last hidden state contains the embeddings of every token.
        token_embeddings = output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )
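To see what mean_pooling computes, here is a small self-contained check on dummy tensors (shapes and values invented for illustration): masked positions contribute nothing, so each sentence embedding is the average of its real token embeddings only.

import torch

token_embeddings = torch.ones(2, 4, 3)  # 2 sentences, 4 token positions, hidden size 3
attention_mask = torch.tensor([[1, 1, 1, 1], [1, 1, 0, 0]])  # second sentence is padded

mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
pooled = torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
print(pooled)  # all ones: padding positions are excluded from the average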
2 changes: 1 addition & 1 deletion ckb/models/transformer.py
@@ -117,7 +117,7 @@ def encoder(self, e):
         add_special_tokens=True,
         truncation=True,
         max_length=self.max_length,
-        padding="max_length",
+        padding="longest",
         return_token_type_ids=True,
     )
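This switch pads each batch only to its longest sequence instead of always padding to the model maximum, which avoids running the encoder over hundreds of padding tokens for short batches. A quick sketch of the difference, assuming an illustrative checkpoint and max length (neither is fixed by this commit):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
batch = ["a short text", "a slightly longer text about knowledge bases"]

longest = tokenizer(batch, padding="longest", truncation=True, max_length=512)
fixed = tokenizer(batch, padding="max_length", truncation=True, max_length=512)

print(len(longest["input_ids"][0]))  # padded only to the longest sentence in the batch
print(len(fixed["input_ids"][0]))    # always 512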
2 changes: 2 additions & 0 deletions docs/api/datasets/Semanlink.md
@@ -8,6 +8,8 @@ Semanlink dataset.
 
 - **batch_size**
 
+- **use_labels** – defaults to `True`
+
 - **shuffle** – defaults to `True`
 
 - **pre_compute** – defaults to `True`
14 changes: 1 addition & 13 deletions docs/api/evaluation/Evaluation.md
@@ -16,11 +16,7 @@ Wrapper for MKB evaluation module.
 
 - **device** – defaults to `cuda`
 
-- **num_workers** – defaults to `1`
-
-- **entities_to_drop** – defaults to `[]`
-
-- **same_entities** – defaults to `{}`
+- **num_workers** – defaults to `0`
 
 
@@ -149,14 +145,6 @@ DistillBert model
 
 - **model**
 
-???- note "solve_same_entities"
-
-    Replace artificial entities by the target. Some description may be dedicated to the same entities.
-
-    **Parameters**
-
-    - **argsort**
-
 ???- note "types_relations"
 
     Divide input dataset relations into different categories (i.e. ONE-TO-ONE, ONE-TO-MANY, MANY-TO-ONE and MANY-TO-MANY) according to the mapping properties of relationships.