Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Language Models from Transformers Lib #5187

Merged
merged 24 commits into from
Feb 12, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
5a71c57
first implementation ready.
dakshvar22 Feb 4, 2020
d5a1b85
tested all available models. implementation works
dakshvar22 Feb 4, 2020
205c7bd
refactored class name
dakshvar22 Feb 4, 2020
7eb475c
remove print statement
dakshvar22 Feb 4, 2020
cc55dfc
Apply suggestions from code review
dakshvar22 Feb 4, 2020
576d8f4
quick review comments. Tests WIP
dakshvar22 Feb 4, 2020
fab9122
Merge branch 'transformers_lm' of github.com:RasaHQ/rasa into transfo…
dakshvar22 Feb 4, 2020
4c3f218
fix imports
dakshvar22 Feb 4, 2020
49e9a15
bug fix to swap seq and sentence embeddings
dakshvar22 Feb 5, 2020
e990e8f
tests for tokenizers are in
dakshvar22 Feb 5, 2020
99bad36
added featurizer tests
dakshvar22 Feb 5, 2020
01c7de5
added documentation
dakshvar22 Feb 5, 2020
3259788
add changelog, move common method out of class
dakshvar22 Feb 5, 2020
d10d73f
refactor spacy doc name
dakshvar22 Feb 5, 2020
452368d
Apply suggestions from code review
dakshvar22 Feb 6, 2020
38b6a01
added new components to test pipelines
dakshvar22 Feb 6, 2020
7c654fd
Merge branch 'transformers_lm' of github.com:RasaHQ/rasa into transfo…
dakshvar22 Feb 6, 2020
7ccafc3
created new pipeline for failing tests
dakshvar22 Feb 9, 2020
6d9c886
separate pipeline for convert as well
dakshvar22 Feb 10, 2020
93fe116
merge tf2, resolve conflicts and resolve comments
dakshvar22 Feb 12, 2020
b88ed3e
refactored variable names
dakshvar22 Feb 12, 2020
4bfb4ed
removed unnecessary component from a test pipeline
dakshvar22 Feb 12, 2020
684836a
Merge branch 'tf2' into transformers_lm
dakshvar22 Feb 12, 2020
86ee337
added constants
dakshvar22 Feb 12, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions rasa/nlu/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
}

SPACY_DOCS = {TEXT_ATTRIBUTE: "spacy_doc", RESPONSE_ATTRIBUTE: "response_spacy_doc"}
# Message-data keys under which an upstream language-model component stores
# its computed "transformers doc" for each attribute. Consumers in this
# change read "tokens", "sequence_features" and "sentence_features" out of
# that doc (see LanguageModelTokenizer / LanguageModelFeaturizer) —
# presumably written by HFTransformersNLP; verify against that component.
TRANSFORMERS_DOCS = {
    TEXT_ATTRIBUTE: "text_transformers_doc",
    RESPONSE_ATTRIBUTE: "response_transformers_doc",
}

DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]

Expand Down
64 changes: 64 additions & 0 deletions rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import numpy as np
import typing
from typing import Any, Optional, Text

from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.featurizers.featurizer import Featurizer
from rasa.nlu.training_data import Message, TrainingData

if typing.TYPE_CHECKING:
from spacy.tokens import Doc

from rasa.nlu.constants import (
TEXT_ATTRIBUTE,
TRANSFORMERS_DOCS,
DENSE_FEATURE_NAMES,
DENSE_FEATURIZABLE_ATTRIBUTES,
TOKENS_NAMES,
)


class LanguageModelFeaturizer(Featurizer):
    """Featurizer that exposes language-model embeddings as dense features.

    Expects an upstream component to have attached a transformers doc
    (a dict containing ``sequence_features`` and ``sentence_features``)
    to the message under the ``TRANSFORMERS_DOCS`` keys. This component
    concatenates those embeddings and stores them as dense features.
    """

    provides = [
        DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
    ]

    requires = [
        TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
    ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig],
        **kwargs: Any,
    ) -> None:
        """Attach language-model features to every training example.

        Args:
            training_data: training examples to featurize in place.
            config: model configuration (unused here).
        """
        for example in training_data.intent_examples:
            for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
                self._set_lm_features(example, attribute)

    def get_doc(self, message: Message, attribute: Text) -> Any:
        """Return the transformers doc stored on `message` for `attribute`,
        or None when the upstream component did not set one."""
        return message.get(TRANSFORMERS_DOCS[attribute])

    def process(self, message: Message, **kwargs: Any) -> None:
        """Attach language-model features to an incoming message.

        Only the text attribute is featurized at inference time (the
        helper's default attribute).
        """
        self._set_lm_features(message)

    def _set_lm_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE):
        """Adds the precomputed word vectors to the messages features."""
        doc = self.get_doc(message, attribute)
        if doc is None:
            # Nothing to featurize for this attribute on this message.
            return

        # Token-level embeddings followed by the single sentence embedding.
        combined = np.concatenate([doc["sequence_features"], doc["sentence_features"]])
        combined = self._combine_with_existing_dense_features(
            message, combined, DENSE_FEATURE_NAMES[attribute]
        )
        message.set(DENSE_FEATURE_NAMES[attribute], combined)
6 changes: 6 additions & 0 deletions rasa/nlu/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
CountVectorsFeaturizer,
)
from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer
from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
from rasa.nlu.model import Metadata
from rasa.nlu.selectors.embedding_response_selector import ResponseSelector
Expand All @@ -36,8 +37,10 @@
from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
from rasa.nlu.utils.mitie_utils import MitieNLP
from rasa.nlu.utils.spacy_utils import SpacyNLP
from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
from rasa.utils.common import class_from_module_path, raise_warning

if typing.TYPE_CHECKING:
Expand All @@ -53,12 +56,14 @@
# utils
SpacyNLP,
MitieNLP,
HFTransformersNLP,
# tokenizers
MitieTokenizer,
SpacyTokenizer,
WhitespaceTokenizer,
ConveRTTokenizer,
JiebaTokenizer,
LanguageModelTokenizer,
# extractors
SpacyEntityExtractor,
MitieEntityExtractor,
Expand All @@ -72,6 +77,7 @@
LexicalSyntacticFeaturizer,
CountVectorsFeaturizer,
ConveRTFeaturizer,
LanguageModelFeaturizer,
# classifiers
SklearnIntentClassifier,
MitieIntentClassifier,
Expand Down
35 changes: 35 additions & 0 deletions rasa/nlu/tokenizers/lm_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import typing
from typing import Text, List, Any, Dict

from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.nlu.training_data import Message

from rasa.nlu.constants import (
TOKENS_NAMES,
TRANSFORMERS_DOCS,
DENSE_FEATURIZABLE_ATTRIBUTES,
)


class LanguageModelTokenizer(Tokenizer):
    """Tokenizer that reuses tokens produced by an upstream language-model
    component.

    It performs no tokenization itself: it looks up the transformers doc
    stored on the message and returns its ``tokens`` entry as-is.
    """

    provides = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]

    requires = [
        TRANSFORMERS_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
    ]

    defaults = {
        # Flag to check whether to split intents
        "intent_tokenization_flag": False,
        # Symbol on which intent should be split
        "intent_split_symbol": "_",
    }

    def get_doc(self, message: Message, attribute: Text) -> Dict[Text, Any]:
        """Return the transformers doc stored on `message` for `attribute`."""
        return message.get(TRANSFORMERS_DOCS[attribute])

    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        """Return the precomputed tokens for `attribute`.

        NOTE(review): raises if no transformers doc was set upstream —
        presumably the pipeline guarantees one exists; verify against the
        component ordering.
        """
        return self.get_doc(message, attribute)["tokens"]
Empty file.
Loading