diff --git a/alt_requirements/requirements_full.txt b/alt_requirements/requirements_full.txt index e033e2544462..b97e60736d04 100644 --- a/alt_requirements/requirements_full.txt +++ b/alt_requirements/requirements_full.txt @@ -10,4 +10,7 @@ # ConveRT Requirements -r requirements_pretrained_embeddings_convert.txt +# Transformers Requirements +-r requirements_pretrained_embeddings_transformers.txt + jieba==0.39 diff --git a/alt_requirements/requirements_pretrained_embeddings_convert.txt b/alt_requirements/requirements_pretrained_embeddings_convert.txt index 265cbaeee044..7a96d5bba9e3 100644 --- a/alt_requirements/requirements_pretrained_embeddings_convert.txt +++ b/alt_requirements/requirements_pretrained_embeddings_convert.txt @@ -1,5 +1,5 @@ # Minimum Install Requirements -r ../requirements.txt -tensorflow_text==1.15.1 -tensorflow_hub==0.6.0 +tensorflow_text==2.1.0rc0 +tensorflow_hub==0.7.0 diff --git a/alt_requirements/requirements_pretrained_embeddings_transformers.txt b/alt_requirements/requirements_pretrained_embeddings_transformers.txt new file mode 100644 index 000000000000..a513258cbe48 --- /dev/null +++ b/alt_requirements/requirements_pretrained_embeddings_transformers.txt @@ -0,0 +1,4 @@ +# Minimum Install Requirements +-r ../requirements.txt + +transformers==2.3.0 \ No newline at end of file diff --git a/changelog/4817.improvement.rst b/changelog/4817.improvement.rst index 71fb53e94139..7c310bc073e1 100644 --- a/changelog/4817.improvement.rst +++ b/changelog/4817.improvement.rst @@ -1,2 +1,4 @@ Part of Slack sanitization: -Multiple garbled URL's in a string coming from slack will be converted into actual strings. ``Example: health check of and to health check of eemdb.net and eemdb1.net`` +Multiple garbled URL's in a string coming from slack will be converted into actual strings. +``Example: health check of and to health check of +eemdb.net and eemdb1.net`` diff --git a/changelog/5065.feature.rst b/changelog/5065.feature.rst new file mode 100644 index 000000000000..4360dee5c431 --- /dev/null +++ b/changelog/5065.feature.rst @@ -0,0 +1,5 @@ +Add :ref:`LexicalSyntacticFeaturizer` to sparse featurizers. + +``LexicalSyntacticFeaturizer`` does the same featurization as the ``CRFEntityExtractor``. We extracted the +featurization into a separate component so that the features can be reused and featurization is independent from the +entity extraction. \ No newline at end of file diff --git a/changelog/5187.feature.rst b/changelog/5187.feature.rst new file mode 100644 index 000000000000..c18bc7efdfdd --- /dev/null +++ b/changelog/5187.feature.rst @@ -0,0 +1,7 @@ +Integrate language models from HuggingFace's `Transformers `_ Library. + +Add a new NLP component :ref:`HFTransformersNLP` which tokenizes and featurizes incoming messages using a specified +pre-trained model with the Transformers library as the backend. +Add :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` which use the information from +:ref:`HFTransformersNLP` and sets them correctly for message object. +Language models currently supported: BERT, OpenAIGPT, GPT-2, XLNet, DistilBert, RoBERTa. diff --git a/changelog/5230.feature.rst b/changelog/5230.feature.rst new file mode 100644 index 000000000000..8b1c3769fcc6 --- /dev/null +++ b/changelog/5230.feature.rst @@ -0,0 +1,15 @@ +Refactor how GPU and CPU environments are configured for TensorFlow 2.0. + +Please refer to the :ref:`documentation ` to understand +which environment variables to set in what scenarios. A couple of examples are shown below as well: + +.. 
code-block:: python + + # This specifies to use 1024 MB of memory from GPU with logical ID 0 and 2048 MB of memory from GPU with logical ID 1 + TF_GPU_MEMORY_ALLOC="0:1024, 1:2048" + + # Specifies that at most 3 CPU threads can be used to parallelize multiple non-blocking operations + TF_INTER_OP_PARALLELISM_THREADS="3" + + # Specifies that at most 2 CPU threads can be used to parallelize a particular operation. + TF_INTRA_OP_PARALLELISM_THREADS="2" diff --git a/changelog/5266.feature.rst b/changelog/5266.feature.rst new file mode 100644 index 000000000000..e876d7d0d51d --- /dev/null +++ b/changelog/5266.feature.rst @@ -0,0 +1,14 @@ +Added a new NLU component :ref:`DIETClassifier ` and a new policy :ref:`TEDPolicy `. + +DIET (Dual Intent and Entity Transformer) is a multi-task architecture for intent classification and entity +recognition. You can read more about this component in our :ref:`documentation `. +The new component will replace the :ref:`EmbeddingIntentClassifier ` and the +:ref:`CRFEntityExtractor` in the future. +Those two components are deprecated from now on. +See :ref:`migration guide ` for details on how to +switch to the new component. + +:ref:`TEDPolicy ` is the new name for :ref:`EmbeddingPolicy `. +``EmbeddingPolicy`` is deprecated from now on. +The functionality of ``TEDPolicy`` and ``EmbeddingPolicy`` is the same. +Please update your configuration file to use the new name for the policy. diff --git a/changelog/5266.improvement.rst b/changelog/5266.improvement.rst new file mode 100644 index 000000000000..492b1afd5bcf --- /dev/null +++ b/changelog/5266.improvement.rst @@ -0,0 +1,22 @@ +We updated our code to TensorFlow 2. + +We added a new docker image for ConveRT. +The new images uses the following configuration + +``` +language: "en" + +pipeline: + - name: ConveRTTokenizer + - name: ConveRTFeaturizer + - name: RegexFeaturizer + - name: LexicalSyntacticFeaturizer + - name: CountVectorsFeaturizer + - name: CountVectorsFeaturizer + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: DIETClassifier + - name: EntitySynonymMapper + - name: ResponseSelector +``` \ No newline at end of file diff --git a/changelog/5266.misc.rst b/changelog/5266.misc.rst new file mode 100644 index 000000000000..acd65f814249 --- /dev/null +++ b/changelog/5266.misc.rst @@ -0,0 +1,9 @@ +We deprecated all existing NLU pipeline templates, ``SklearnIntentClassifier`` and ``KerasPolicy``. + +Please list the components you want to use directly in your configuration file. +Check out :ref:`Choosing a Pipeline ` to decide what components to +include in your pipeline. + +Use ``DIETClassifier`` instead of ``SklearnIntentClassifier``. + +Use ``TEDPolicy`` instead of ``KerasPolicy``. \ No newline at end of file diff --git a/changelog/5266.removal.rst b/changelog/5266.removal.rst new file mode 100644 index 000000000000..f8b4270bbcf8 --- /dev/null +++ b/changelog/5266.removal.rst @@ -0,0 +1,2 @@ +Properties ``Component.provides`` and ``Component.requires`` are deprecated. +Use ``Component.required_components()`` instead. \ No newline at end of file diff --git a/changelog/663.feature.rst b/changelog/663.feature.rst new file mode 100644 index 000000000000..9d218cf1280e --- /dev/null +++ b/changelog/663.feature.rst @@ -0,0 +1,6 @@ +The sentence vector of the ``SpacyFeaturizer`` and ``MitieFeaturizer`` can be calculated using max or mean pooling. + +To specify the pooling operation, set the option ``pooling`` for the ``SpacyFeaturizer`` or the ``MitieFeaturizer`` +in your configuration file. 
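For example, a configuration along these lines switches the ``SpacyFeaturizer`` to max pooling (a sketch only; the surrounding components are taken from the other example configs in this diff):

.. code-block:: yaml

    language: "en"

    pipeline:
      - name: "SpacyNLP"
      - name: "SpacyTokenizer"
      - name: "SpacyFeaturizer"
        # "max" or "mean" (the default), per this changelog entry
        pooling: "max"
      - name: "DIETClassifier"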
The default pooling operation is ``mean``. The mean pooling operation also does not take +into account words, that do not have a word vector. +See our :ref:`documentation ` for more details. diff --git a/changelog/699.misc.rst b/changelog/699.misc.rst index 5677d8bc05aa..e834f0d25d54 100644 --- a/changelog/699.misc.rst +++ b/changelog/699.misc.rst @@ -1,6 +1,3 @@ -The `EmbeddingPolicy `_ -replaces the ``KerasPolicy`` in new Rasa projects generated with ``rasa init``. -The `EmbeddingPolicy `_ -is now the recommended machine learning policy. Please see the -`migration guide `_ -if you want to switch to this new policy in an existing project. +The :ref:`TEDPolicy ` replaces the ``KerasPolicy`` in new Rasa projects generated with ``rasa init``. +The :ref:`TEDPolicy ` is now the recommended machine learning policy. Please see the +:ref:`migration guide ` if you want to switch to this new policy in an existing project. diff --git a/data/configs_for_docs/default_config.yml b/data/configs_for_docs/default_config.yml new file mode 100644 index 000000000000..46b75c8078c7 --- /dev/null +++ b/data/configs_for_docs/default_config.yml @@ -0,0 +1,14 @@ +language: "en" + +pipeline: + - name: WhitespaceTokenizer + - name: RegexFeaturizer + - name: LexicalSyntacticFeaturizer + - name: CountVectorsFeaturizer + - name: CountVectorsFeaturizer + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: DIETClassifier + - name: EntitySynonymMapper + - name: ResponseSelector \ No newline at end of file diff --git a/data/configs_for_docs/default_english_config.yml b/data/configs_for_docs/default_english_config.yml new file mode 100644 index 000000000000..ee0da9bfab1d --- /dev/null +++ b/data/configs_for_docs/default_english_config.yml @@ -0,0 +1,15 @@ +language: "en" + +pipeline: + - name: ConveRTTokenizer + - name: ConveRTFeaturizer + - name: RegexFeaturizer + - name: LexicalSyntacticFeaturizer + - name: CountVectorsFeaturizer + - name: CountVectorsFeaturizer + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: DIETClassifier + - name: EntitySynonymMapper + - name: ResponseSelector diff --git a/data/configs_for_docs/default_spacy_config.yml b/data/configs_for_docs/default_spacy_config.yml new file mode 100644 index 000000000000..360a2eef2c92 --- /dev/null +++ b/data/configs_for_docs/default_spacy_config.yml @@ -0,0 +1,17 @@ +language: "en" + +pipeline: + - name: SpacyNLP + - name: SpacyTokenizer + - name: SpacyFeaturizer + - name: RegexFeaturizer + - name: LexicalSyntacticFeaturizer + - name: CountVectorsFeaturizer + - name: CountVectorsFeaturizer + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: DIETClassifier + - name: EntitySynonymMapper + - name: ResponseSelector + diff --git a/sample_configs/config_pretrained_embeddings_convert.yml b/data/configs_for_docs/pretrained_embeddings_convert_config_1.yml similarity index 100% rename from sample_configs/config_pretrained_embeddings_convert.yml rename to data/configs_for_docs/pretrained_embeddings_convert_config_1.yml diff --git a/data/configs_for_docs/pretrained_embeddings_convert_config_2.yml b/data/configs_for_docs/pretrained_embeddings_convert_config_2.yml new file mode 100644 index 000000000000..58d393dc5d36 --- /dev/null +++ b/data/configs_for_docs/pretrained_embeddings_convert_config_2.yml @@ -0,0 +1,6 @@ +language: "en" + +pipeline: +- name: "ConveRTTokenizer" +- name: "ConveRTFeaturizer" +- name: "EmbeddingIntentClassifier" diff --git a/sample_configs/config_pretrained_embeddings_mitie.yml 
b/data/configs_for_docs/pretrained_embeddings_mitie_config_1.yml similarity index 100% rename from sample_configs/config_pretrained_embeddings_mitie.yml rename to data/configs_for_docs/pretrained_embeddings_mitie_config_1.yml diff --git a/sample_configs/config_pretrained_embeddings_mitie_2.yml b/data/configs_for_docs/pretrained_embeddings_mitie_config_2.yml similarity index 100% rename from sample_configs/config_pretrained_embeddings_mitie_2.yml rename to data/configs_for_docs/pretrained_embeddings_mitie_config_2.yml diff --git a/sample_configs/config_pretrained_embeddings_spacy.yml b/data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml similarity index 100% rename from sample_configs/config_pretrained_embeddings_spacy.yml rename to data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml diff --git a/data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml b/data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml new file mode 100644 index 000000000000..14aca60c5a69 --- /dev/null +++ b/data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml @@ -0,0 +1,11 @@ +language: "en" + +pipeline: +- name: "SpacyNLP" +- name: "SpacyTokenizer" +- name: "SpacyFeaturizer" +- name: "RegexFeaturizer" +- name: "CRFEntityExtractor" +- name: "EntitySynonymMapper" + +- name: "SklearnIntentClassifier" diff --git a/sample_configs/config_supervised_embeddings.yml b/data/configs_for_docs/supervised_embeddings_config_1.yml similarity index 100% rename from sample_configs/config_supervised_embeddings.yml rename to data/configs_for_docs/supervised_embeddings_config_1.yml diff --git a/data/configs_for_docs/supervised_embeddings_config_2.yml b/data/configs_for_docs/supervised_embeddings_config_2.yml new file mode 100644 index 000000000000..c1a776269dae --- /dev/null +++ b/data/configs_for_docs/supervised_embeddings_config_2.yml @@ -0,0 +1,13 @@ +language: "en" + +pipeline: +- name: "WhitespaceTokenizer" +- name: "RegexFeaturizer" +- name: "CRFEntityExtractor" +- name: "EntitySynonymMapper" +- name: "CountVectorsFeaturizer" +- name: "CountVectorsFeaturizer" + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 +- name: "EmbeddingIntentClassifier" diff --git a/data/test/config_embedding_test.yml b/data/test/config_embedding_test.yml index 379e1e2e3ef7..1c9eb116fb09 100644 --- a/data/test/config_embedding_test.yml +++ b/data/test/config_embedding_test.yml @@ -2,5 +2,5 @@ language: en pipeline: - name: "CountVectorsFeaturizer" max_ngram: 3 -- name: "EmbeddingIntentClassifier" - epochs: 10 \ No newline at end of file +- name: "DIETClassifier" + epochs: 10 diff --git a/sample_configs/config_crf_custom_features.yml b/data/test_config/config_crf_custom_features.yml similarity index 100% rename from sample_configs/config_crf_custom_features.yml rename to data/test_config/config_crf_custom_features.yml diff --git a/sample_configs/config_defaults.yml b/data/test_config/config_defaults.yml similarity index 100% rename from sample_configs/config_defaults.yml rename to data/test_config/config_defaults.yml diff --git a/sample_configs/config_embedding_intent_response_selector.yml b/data/test_config/config_embedding_intent_response_selector.yml similarity index 79% rename from sample_configs/config_embedding_intent_response_selector.yml rename to data/test_config/config_embedding_intent_response_selector.yml index 705cfca7ff51..b2be5582ade5 100644 --- a/sample_configs/config_embedding_intent_response_selector.yml +++ b/data/test_config/config_embedding_intent_response_selector.yml @@ -3,7 +3,7 @@ 
language: "en" pipeline: - name: "WhitespaceTokenizer" - name: "CountVectorsFeaturizer" - - name: "EmbeddingIntentClassifier" + - name: "DIETClassifier" epochs: 2 - name: "ResponseSelector" epochs: 2 diff --git a/data/test_config/config_pretrained_embeddings_convert.yml b/data/test_config/config_pretrained_embeddings_convert.yml new file mode 100644 index 000000000000..992e56a5e186 --- /dev/null +++ b/data/test_config/config_pretrained_embeddings_convert.yml @@ -0,0 +1,3 @@ +language: "en" + +pipeline: "pretrained_embeddings_convert" diff --git a/data/test_config/config_pretrained_embeddings_mitie.yml b/data/test_config/config_pretrained_embeddings_mitie.yml new file mode 100644 index 000000000000..d1b8b86dd953 --- /dev/null +++ b/data/test_config/config_pretrained_embeddings_mitie.yml @@ -0,0 +1,11 @@ +language: "en" + +pipeline: +- name: "MitieNLP" + model: "data/total_word_feature_extractor.dat" +- name: "MitieTokenizer" +- name: "MitieEntityExtractor" +- name: "EntitySynonymMapper" +- name: "RegexFeaturizer" +- name: "MitieFeaturizer" +- name: "SklearnIntentClassifier" diff --git a/data/test_config/config_pretrained_embeddings_mitie_2.yml b/data/test_config/config_pretrained_embeddings_mitie_2.yml new file mode 100644 index 000000000000..356eb898e812 --- /dev/null +++ b/data/test_config/config_pretrained_embeddings_mitie_2.yml @@ -0,0 +1,10 @@ +language: "en" + +pipeline: +- name: "MitieNLP" + model: "data/total_word_feature_extractor.dat" +- name: "MitieTokenizer" +- name: "MitieEntityExtractor" +- name: "EntitySynonymMapper" +- name: "RegexFeaturizer" +- name: "MitieIntentClassifier" diff --git a/sample_configs/config_pretrained_embeddings_mitie_zh.yml b/data/test_config/config_pretrained_embeddings_mitie_zh.yml similarity index 100% rename from sample_configs/config_pretrained_embeddings_mitie_zh.yml rename to data/test_config/config_pretrained_embeddings_mitie_zh.yml diff --git a/data/test_config/config_pretrained_embeddings_spacy.yml b/data/test_config/config_pretrained_embeddings_spacy.yml new file mode 100644 index 000000000000..3516519cd529 --- /dev/null +++ b/data/test_config/config_pretrained_embeddings_spacy.yml @@ -0,0 +1,3 @@ +language: "en" + +pipeline: "pretrained_embeddings_spacy" diff --git a/sample_configs/config_pretrained_embeddings_spacy_de.yml b/data/test_config/config_pretrained_embeddings_spacy_de.yml similarity index 100% rename from sample_configs/config_pretrained_embeddings_spacy_de.yml rename to data/test_config/config_pretrained_embeddings_spacy_de.yml diff --git a/sample_configs/config_pretrained_embeddings_spacy_duckling.yml b/data/test_config/config_pretrained_embeddings_spacy_duckling.yml similarity index 100% rename from sample_configs/config_pretrained_embeddings_spacy_duckling.yml rename to data/test_config/config_pretrained_embeddings_spacy_duckling.yml diff --git a/data/test_config/config_supervised_embeddings.yml b/data/test_config/config_supervised_embeddings.yml new file mode 100644 index 000000000000..3d965f6147d4 --- /dev/null +++ b/data/test_config/config_supervised_embeddings.yml @@ -0,0 +1,3 @@ +language: "en" + +pipeline: "supervised_embeddings" diff --git a/sample_configs/config_supervised_embeddings_duckling.yml b/data/test_config/config_supervised_embeddings_duckling.yml similarity index 92% rename from sample_configs/config_supervised_embeddings_duckling.yml rename to data/test_config/config_supervised_embeddings_duckling.yml index c1771ea5addc..7b0635d8a124 100644 --- a/sample_configs/config_supervised_embeddings_duckling.yml +++ 
b/data/test_config/config_supervised_embeddings_duckling.yml @@ -3,5 +3,6 @@ language: "en" pipeline: - name: "CountVectorsFeaturizer" - name: "EmbeddingIntentClassifier" + epochs: 2 - name: "DucklingHTTPExtractor" url: "http://duckling:8000" diff --git a/sample_configs/config_train_server_json.yml b/data/test_config/config_train_server_json.yml similarity index 100% rename from sample_configs/config_train_server_json.yml rename to data/test_config/config_train_server_json.yml diff --git a/sample_configs/config_train_server_md.yml b/data/test_config/config_train_server_md.yml similarity index 100% rename from sample_configs/config_train_server_md.yml rename to data/test_config/config_train_server_md.yml diff --git a/data/test_config/embedding_random_seed.yaml b/data/test_config/embedding_random_seed.yaml index c2bd5bb86918..53ee82926eb3 100644 --- a/data/test_config/embedding_random_seed.yaml +++ b/data/test_config/embedding_random_seed.yaml @@ -1,3 +1,4 @@ policies: -- name: EmbeddingPolicy +- name: TEDPolicy random_seed: 42 + epochs: 2 diff --git a/docker/Dockerfile_full b/docker/Dockerfile_full index 94086c350fd4..24d2147dc9d8 100644 --- a/docker/Dockerfile_full +++ b/docker/Dockerfile_full @@ -72,7 +72,7 @@ FROM base AS runner WORKDIR /app # Copy over default pipeline config -COPY sample_configs/config_pretrained_embeddings_spacy_duckling.yml config.yml +COPY docker/configs/config_pretrained_embeddings_spacy_en_duckling.yml config.yml # Copy over mitie model COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat diff --git a/docker/Dockerfile_pretrained_embeddings_mitie_en b/docker/Dockerfile_pretrained_embeddings_mitie_en index 7f0737e9404b..663986b7faf1 100644 --- a/docker/Dockerfile_pretrained_embeddings_mitie_en +++ b/docker/Dockerfile_pretrained_embeddings_mitie_en @@ -66,7 +66,7 @@ FROM base AS runner WORKDIR /app # Copy over default pipeline config -COPY sample_configs/config_pretrained_embeddings_mitie.yml config.yml +COPY docker/configs/config_pretrained_embeddings_mitie.yml config.yml # Copy over mitie model COPY --from=builder /app/data/total_word_feature_extractor.dat data/total_word_feature_extractor.dat diff --git a/docker/Dockerfile_pretrained_embeddings_spacy_de b/docker/Dockerfile_pretrained_embeddings_spacy_de index a95318485c66..55b2011fc991 100644 --- a/docker/Dockerfile_pretrained_embeddings_spacy_de +++ b/docker/Dockerfile_pretrained_embeddings_spacy_de @@ -67,7 +67,7 @@ FROM base AS runner WORKDIR /app # Copy over default pipeline config -COPY sample_configs/config_pretrained_embeddings_spacy_de.yml config.yml +COPY docker/configs/config_pretrained_embeddings_spacy_de.yml config.yml # Copy virtualenv from previous stage COPY --from=builder /build /build diff --git a/docker/Dockerfile_pretrained_embeddings_spacy_en b/docker/Dockerfile_pretrained_embeddings_spacy_en index 289c20053349..6796e1af18ed 100644 --- a/docker/Dockerfile_pretrained_embeddings_spacy_en +++ b/docker/Dockerfile_pretrained_embeddings_spacy_en @@ -67,7 +67,7 @@ FROM base AS runner WORKDIR /app # Copy over default pipeline config -COPY sample_configs/config_pretrained_embeddings_spacy.yml config.yml +COPY docker/configs/config_pretrained_embeddings_spacy_en.yml config.yml # Copy virtualenv from previous stage COPY --from=builder /build /build diff --git a/docker/configs/config_pretrained_embeddings_mitie.yml b/docker/configs/config_pretrained_embeddings_mitie.yml new file mode 100644 index 000000000000..1ff89972039a --- /dev/null +++ 
b/docker/configs/config_pretrained_embeddings_mitie.yml @@ -0,0 +1,11 @@ +language: "en" + +pipeline: + - name: MitieNLP + model: "data/total_word_feature_extractor.dat" + - name: MitieTokenizer + - name: MitieEntityExtractor + - name: EntitySynonymMapper + - name: RegexFeaturizer + - name: MitieFeaturizer + - name: SklearnIntentClassifier diff --git a/docker/configs/config_pretrained_embeddings_spacy_de.yml b/docker/configs/config_pretrained_embeddings_spacy_de.yml new file mode 100644 index 000000000000..c5068fe6377e --- /dev/null +++ b/docker/configs/config_pretrained_embeddings_spacy_de.yml @@ -0,0 +1,16 @@ +language: "de" + +pipeline: + - name: SpacyNLP + - name: SpacyTokenizer + - name: SpacyFeaturizer + - name: RegexFeaturizer + - name: LexicalSyntacticFeaturizer + - name: CountVectorsFeaturizer + - name: CountVectorsFeaturizer + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: DIETClassifier + - name: EntitySynonymMapper + - name: ResponseSelector diff --git a/docker/configs/config_pretrained_embeddings_spacy_en.yml b/docker/configs/config_pretrained_embeddings_spacy_en.yml new file mode 100644 index 000000000000..b6591e42bc97 --- /dev/null +++ b/docker/configs/config_pretrained_embeddings_spacy_en.yml @@ -0,0 +1,16 @@ +language: "en" + +pipeline: + - name: SpacyNLP + - name: SpacyTokenizer + - name: SpacyFeaturizer + - name: RegexFeaturizer + - name: LexicalSyntacticFeaturizer + - name: CountVectorsFeaturizer + - name: CountVectorsFeaturizer + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: DIETClassifier + - name: EntitySynonymMapper + - name: ResponseSelector diff --git a/docker/configs/config_pretrained_embeddings_spacy_en_duckling.yml b/docker/configs/config_pretrained_embeddings_spacy_en_duckling.yml new file mode 100644 index 000000000000..cd314e4485be --- /dev/null +++ b/docker/configs/config_pretrained_embeddings_spacy_en_duckling.yml @@ -0,0 +1,18 @@ +language: "en" + +pipeline: + - name: SpacyNLP + - name: SpacyTokenizer + - name: SpacyFeaturizer + - name: RegexFeaturizer + - name: LexicalSyntacticFeaturizer + - name: CountVectorsFeaturizer + - name: CountVectorsFeaturizer + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: DIETClassifier + - name: EntitySynonymMapper + - name: ResponseSelector + - name: DucklingHTTPExtractor + url: "http://duckling:8000" diff --git a/docs/api/custom-nlu-components.rst b/docs/api/custom-nlu-components.rst index 733e65d31f45..f41f61b7ba15 100644 --- a/docs/api/custom-nlu-components.rst +++ b/docs/api/custom-nlu-components.rst @@ -51,6 +51,8 @@ Component .. autoclass:: rasa.nlu.components.Component + .. automethod:: required_components + .. automethod:: required_packages .. automethod:: create diff --git a/docs/api/tensorflow_usage.rst b/docs/api/tensorflow_usage.rst new file mode 100644 index 000000000000..aa9d84023844 --- /dev/null +++ b/docs/api/tensorflow_usage.rst @@ -0,0 +1,52 @@ +:desc: Find out how to configure your environment for efficient usage of TensorFlow inside Rasa Open Source. + +.. _tensorflow_usage: + +TensorFlow Configuration +======================== + +TensorFlow allows configuring options in the runtime environment via +`TF Config submodule `_. Rasa Open Source supports a smaller subset of these +configuration options and makes appropriate calls to the ``tf.config`` submodule. +This smaller subset comprises of configurations that developers frequently use with Rasa Open Source. +All configuration options are specified using environment variables as shown in subsequent sections. 
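For example, when running Rasa Open Source in a container, these variables can be set on the service definition. The snippet below is only a sketch: the service name and image tag are placeholders, and the values reuse the examples given later on this page.

.. code-block:: yaml

    # docker-compose.yml (hypothetical service definition)
    version: "3.0"
    services:
      rasa:
        image: rasa/rasa:latest-full  # placeholder image tag
        environment:
          # at most 3 CPU threads to parallelize multiple non-blocking operations
          TF_INTER_OP_PARALLELISM_THREADS: "3"
          # at most 2 CPU threads to parallelize a single operation
          TF_INTRA_OP_PARALLELISM_THREADS: "2"
          # grow GPU memory on demand instead of reserving all of it up front
          TF_FORCE_GPU_ALLOW_GROWTH: "True"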
+ +Optimizing CPU Performance +-------------------------- + +Parallelizing One Operation +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Set ``TF_INTRA_OP_PARALLELISM_THREADS`` as an environment variable to specify the maximum number of threads that can be used +to parallelize the execution of one operation. If left unspecified, this value defaults to ``0`` which means TensorFlow should +pick an appropriate value depending on the system configuration. + +Parallelizing Multiple Operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Set ``TF_INTER_OP_PARALLELISM_THREADS`` as an environment variable to specify the maximum number of threads that can be used +to parallelize the execution of multiple **non-blocking** operations. If left unspecified, this value defaults to ``0`` +which means TensorFlow should pick an appropriate value depending on the system configuration. + +Optimizing GPU Performance +-------------------------- + +Limiting GPU Memory Growth +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +TensorFlow by default blocks all the available GPU memory for the running process. This can be limiting if you are running +multiple TensorFlow processes and want to distribute memory across them. To prevent this, +set the environment variable ``TF_FORCE_GPU_ALLOW_GROWTH`` to ``True``. + +Restricting Absolute GPU Memory Available +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Often, a developer wants to limit the absolute amount of GPU memory that can be used by a process. + +For example, you may have two visible GPUs(``GPU:0`` and ``GPU:1``) and you want to allocate 1024 MB from the first GPU +and 2048 MB from the second GPU. +You can do so by setting an environment variable as ``TF_GPU_MEMORY_ALLOC="0:1024, 1:2048"``. + +Another scenario can be where you have access to 2 GPUs(``GPU:0`` and ``GPU:1``) but you would like to use only the second +GPU. +``TF_GPU_MEMORY_ALLOC="1:2048"`` would make 2048 MB of memory available from GPU 1. diff --git a/docs/conf.py b/docs/conf.py index 8b08d499c0ae..63ccfeb2bee1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -366,6 +366,7 @@ ("py:class", "typing.Optional"), ("py:class", "typing.Generator"), ("py:class", "typing.Iterator"), + ("py:class", "typing.Type"), ("py:class", "collections.deque"), ("py:class", "sanic.app.Sanic"), ("py:data", "typing.Any"), diff --git a/docs/core/policies.rst b/docs/core/policies.rst index ab7b35b9b036..1f176b80fc00 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -148,7 +148,7 @@ expected outcome in the case of a tie. They look like this, where higher numbers | 4. ``FallbackPolicy`` and ``TwoStageFallbackPolicy`` | 3. ``MemoizationPolicy`` and ``AugmentedMemoizationPolicy`` | 2. ``MappingPolicy`` - | 1. ``EmbeddingPolicy``, ``KerasPolicy``, and ``SklearnPolicy`` + | 1. ``TEDPolicy``, ``EmbeddingPolicy``, ``KerasPolicy``, and ``SklearnPolicy`` This priority hierarchy ensures that, for example, if there is an intent with a mapped action, but the NLU confidence is not above the ``nlu_threshold``, the bot will still fall back. In general, it is not recommended to have more @@ -197,56 +197,57 @@ set the ``random_seed`` attribute of the ``KerasPolicy`` to any integer. Embedding Policy ^^^^^^^^^^^^^^^^ -Transformer Embedding Dialogue Policy (TEDP) + .. warning:: + + ``EmbeddingPolicy`` was renamed to ``TEDPolicy``. Please use :ref:`ted_policy` instead of ``EmbeddingPolicy`` + in your policy configuration. The functionality of the policy stayed the same. + +.. 
_ted_policy: -Transformer version of the Recurrent Embedding Dialogue Policy (REDP) -used in our paper: ``_ +TED Policy +^^^^^^^^^^ + +The Transformer Embedding Dialogue (TED) Policy is described in +`our paper `__. This policy has a pre-defined architecture, which comprises the following steps: - - concatenate user input (user intent and entities), - previous system action, slots and active form - for each time step into an input vector - to pre-transformer embedding layer; + - concatenate user input (user intent and entities), previous system actions, slots and active forms for each time + step into an input vector to pre-transformer embedding layer; - feed it to transformer; - - apply a dense layer to the output of the transformer - to get embeddings of a dialogue for each time step; + - apply a dense layer to the output of the transformer to get embeddings of a dialogue for each time step; - apply a dense layer to create embeddings for system actions for each time step; - - calculate the similarity between the - dialogue embedding and embedded system actions. - This step is based on the - `StarSpace `_ idea. + - calculate the similarity between the dialogue embedding and embedded system actions. + This step is based on the `StarSpace `_ idea. -It is recommended to use -``state_featurizer=LabelTokenizerSingleStateFeaturizer(...)`` +It is recommended to use ``state_featurizer=LabelTokenizerSingleStateFeaturizer(...)`` (see :ref:`featurization_conversations` for details). **Configuration:** Configuration parameters can be passed as parameters to the - ``EmbeddingPolicy`` within the policy configuration file. + ``TEDPolicy`` within the configuration file. .. warning:: - Pass an appropriate number of ``epochs`` to the ``EmbeddingPolicy``, - otherwise the policy will be trained only for ``1`` - epoch. + Pass an appropriate number of ``epochs`` to the ``TEDPolicy``, otherwise the policy will be trained only + for ``1`` epoch. The algorithm also has hyper-parameters to control: - neural network's architecture: - - ``hidden_layers_sizes_b`` sets a list of hidden layers + - ``hidden_layers_sizes`` sets a list of hidden layers sizes before embedding layer for system actions, the number - of hidden layers is equal to the length of the list; - - ``transformer_size`` sets the number of units in the transfomer; - - ``num_transformer_layers`` sets the number of transformer layers; - - ``pos_encoding`` sets the type of positional encoding in transformer, - it should be either ``timing`` or ``emb``; - - ``max_seq_length`` sets maximum sequence length - if embedding positional encodings are used; - - ``num_heads`` sets the number of heads in multihead attention; + of hidden layers is equal to the length of the list. + - ``transformer_size`` sets the number of units in the transfomer. + - ``number_of_transformer_layers`` sets the number of transformer layers. + - ``maximum_sequence_length`` sets maximum sequence length. + - ``number_of_attention_heads`` sets the number of heads in multihead attention. + - ``use_key_relative_attention`` if true use key relative embeddings in attention. + - ``use_value_relative_attention`` if true use key relative embeddings in attention. + - ``max_relative_position`` sets the max position for relative embeddings. 
- training: @@ -263,8 +264,8 @@ It is recommended to use - embedding: - - ``embed_dim`` sets the dimension of embedding space; - - ``num_neg`` sets the number of incorrect intent labels, + - ``embedding_dimension`` sets the dimension of embedding space; + - ``number_of_negative_examples`` sets the number of incorrect intent labels, the algorithm will minimize their similarity to the user input during training; - ``similarity_type`` sets the type of the similarity, @@ -276,13 +277,13 @@ It is recommended to use - ``ranking_length`` defines the number of top confidences over which to normalize ranking results if ``loss_type: "softmax"``; to turn off normalization set it to 0 - - ``mu_pos`` controls how similar the algorithm should try + - ``maximum_positive_similarity`` controls how similar the algorithm should try to make embedding vectors for correct intent labels, used only if ``loss_type`` is set to ``margin``; - - ``mu_neg`` controls maximum negative similarity for + - ``maximum_negative_similarity`` controls maximum negative similarity for incorrect intents, used only if ``loss_type`` is set to ``margin``; - - ``use_max_sim_neg`` if ``true`` the algorithm only + - ``use_maximum_negative_similarity`` if ``true`` the algorithm only minimizes maximum similarity over incorrect intent labels, used only if ``loss_type`` is set to ``margin``; - ``scale_loss`` if ``true`` the algorithm will downscale the loss @@ -291,20 +292,21 @@ It is recommended to use - regularization: - - ``C2`` sets the scale of L2 regularization - - ``C_emb`` sets the scale of how important is to minimize + - ``regularization_constant`` sets the scale of L2 regularization. + - ``negative_margin_scale`` sets the scale of how important is to minimize the maximum similarity between embeddings of different - intent labels, used only if ``loss_type`` is set to ``margin``; - - ``droprate_a`` sets the dropout rate between - layers before embedding layer for user inputs; - - ``droprate_b`` sets the dropout rate between layers - before embedding layer for system actions; + intent labels, used only if ``loss_type`` is set to ``margin``. + - ``droprate_dialogue`` sets the dropout rate between + layers before embedding layer for user inputs. + - ``droprate_label`` sets the dropout rate between layers + before embedding layer for system actions. + - ``droprate_attention`` sets the dropout rate for attention. - train accuracy calculation: - - ``evaluate_every_num_epochs`` sets how often to calculate - train accuracy, small values may hurt performance; - - ``evaluate_on_num_examples`` how many examples to use for + - ``evaluate_every_number_of_epochs`` sets how often to calculate + train accuracy, small values may hurt performance. + - ``evaluate_on_number_of_examples`` how many examples to use for hold out validation set to calculate of validation accuracy, large values may hurt performance. @@ -319,11 +321,11 @@ It is recommended to use .. warning:: - If ``evaluate_on_num_examples`` is non zero, random examples will be + If ``evaluate_on_number_of_examples`` is non zero, random examples will be picked by stratified split and used as **hold out** validation set, so they will be excluded from training data. We suggest to set it to zero if data set contains a lot of unique examples - of dialogue turns + of dialogue turns. .. note:: @@ -332,7 +334,7 @@ It is recommended to use .. 
note:: - For ``cosine`` similarity ``mu_pos`` and ``mu_neg`` should + For ``cosine`` similarity ``maximum_positive_similarity`` and ``maximum_negative_similarity`` should be between ``-1`` and ``1``. .. note:: @@ -344,19 +346,92 @@ It is recommended to use ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 8``. - These parameters can be specified in the policy configuration file. - The default values are defined in ``EmbeddingPolicy.defaults``: + These parameters can be specified in the configuration file. + The following default values are set: + + .. code-block:: yaml - .. literalinclude:: ../../rasa/core/policies/embedding_policy.py - :dedent: 4 - :start-after: # default properties (DOC MARKER - don't remove) - :end-before: # end default properties (DOC MARKER - don't remove) + # ## Architecture of the used neural network + # Hidden layer sizes for layers before the dialogue and label embedding layers. + # The number of hidden layers is equal to the length of the corresponding + # list. + "hidden_layers_sizes": {"dialogue": [], "label": []} + # Number of units in transformer + "transformer_size": 128 + # Number of transformer layers + "number_of_transformer_layers": 1 + # If 'True' use key relative embeddings in attention + "use_key_relative_attention": False, + # If 'True' use key relative embeddings in attention + "use_value_relative_attention": False + # Max position for relative embeddings + "max_relative_position": None + # Max sequence length + "maximum_sequence_length": 256 + # Number of attention heads in transformer + "number_of_attention_heads": 4 + # ## Training parameters + # Initial and final batch sizes: + # Batch size will be linearly increased for each epoch. + "batch_size": [8, 32] + # Strategy used when creating batches. + # Can be either 'sequence' or 'balanced'. + "batch_strategy": "balanced" + # Number of epochs to train + "epochs": 1 + # Set random seed to any 'int' to get reproducible results + "random_seed": None + # ## Parameters for embeddings + # Dimension size of embedding vectors + "embedding_dimension": 20 + # The number of incorrect labels. The algorithm will minimize + # their similarity to the user input during training. + "number_of_negative_examples": 20 + # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. + "similarity_type": "auto" + # The type of the loss function, either 'softmax' or 'margin'. + "loss_type": "softmax" + # Number of top actions to normalize scores for loss type 'softmax'. + # Set to 0 to turn off normalization. + "ranking_length": 10 + # Indicates how similar the algorithm should try to make embedding vectors + # for correct labels. + # Should be 0.0 < ... < 1.0 for 'cosine' similarity type. + "maximum_positive_similarity": 0.8 + # Maximum negative similarity for incorrect labels. + # Should be -1.0 < ... < 1.0 for 'cosine' similarity type. + "maximum_negative_similarity": -0.2 + # If 'True' the algorithm only minimizes maximum similarity over + # incorrect intent labels, used only if 'loss_type' is set to 'margin'. + "use_maximum_negative_similarity": True + # Scale loss inverse proportionally to confidence of correct prediction + "scale_loss": True + # ## Regularization parameters + # The scale of regularization + "regularization_constant": 0.001 + # The scale of how important is to minimize the maximum similarity + # between embeddings of different labels. + "negative_margin_scale": 0.8 + # Dropout rate for embedding layers of dialogue features. 
+ "drop_rate_dialogue": 0.1 + # Dropout rate for embedding layers of label, e.g. action, features. + "drop_rate_label": 0.0 + # Dropout rate for attention. + "drop_rate_attention": 0 + # ## Evaluation parameters + # How often calculate validation accuracy. + # Small values may hurt performance, e.g. model accuracy. + "evaluate_every_number_of_epochs": 20 + # How many examples to use for hold out validation set + # Large values may hurt performance, e.g. model accuracy. + "evaluate_on_number_of_examples": 0 .. note:: - Parameter ``mu_neg`` is set to a negative value to mimic + The parameter ``maximum_negative_similarity`` is set to a negative value to mimic the original starspace algorithm in the case - ``mu_neg = mu_pos`` and ``use_max_sim_neg = False``. See + ``maximum_negative_similarity = maximum_positive_similarity`` and + ``use_maximum_negative_similarity = False``. See `starspace paper `_ for details. .. _mapping-policy: diff --git a/docs/index.rst b/docs/index.rst index 28519e449532..3f989311669b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -94,6 +94,7 @@ Understand messages, hold conversations, and connect to messaging channels and A api/lock-stores api/training-data-importers api/core-featurization + api/tensorflow_usage migration-guide changelog diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst index 3dfe3657bda3..11463a6e3395 100644 --- a/docs/migration-guide.rst +++ b/docs/migration-guide.rst @@ -15,26 +15,159 @@ how you can migrate from one version to another. Rasa 1.7 to Rasa 1.8 -------------------- +.. warning:: + + This is a release **breaking backwards compatibility**. + It is not possible to load previously trained models. Please make sure to retrain a + model before trying to use it with this improved version. General ~~~~~~~ -- The :ref:`embedding_policy` replaced the :ref:`keras_policy` as recommended machine +- The :ref:`ted_policy` replaced the :ref:`keras_policy` as recommended machine learning policy. New projects generated with ``rasa init`` will automatically use this policy. In case you want to change your existing model configuration to use the - :ref:`embedding_policy` add this to the ``policies`` section in your ``config.yml`` + :ref:`ted_policy` add this to the ``policies`` section in your ``config.yml`` and remove potentially existing ``KerasPolicy`` entries: .. code-block:: yaml policies: - - ... # other policies - - name: EmbeddingPolicy + # - ... other policies + - name: TEDPolicy max_history: 5 epochs: 100 The given snippet specifies default values for the parameters ``max_history`` and - ``epochs``. ``max_history`` is particularly important and strongly depends on your stories. Please see the docs of the :ref:`embedding_policy` if you want to - customize them. + ``epochs``. ``max_history`` is particularly important and strongly depends on your stories. + Please see the docs of the :ref:`ted_policy` if you want to customize them. + +- All pre-defined pipeline templates are deprecated. **Any templates you use will be + mapped to the new configuration, but the underlying architecture is the same**. + Take a look at :ref:`choosing-a-pipeline` to decide on what components you should use + in your configuration file. + +- The :ref:`embedding_policy` was renamed to :ref:`ted_policy`. The functionality of the policy stayed the same. + Please update your configuration files to use ``TEDPolicy`` instead of ``EmbeddingPolicy``. 
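  In practice this is a pure rename in ``config.yml``; a minimal sketch (the parameter values are only illustrative):

  .. code-block:: yaml

      # before (deprecated)
      policies:
        - name: EmbeddingPolicy
          max_history: 5
          epochs: 100

      # after: same functionality, new name
      policies:
        - name: TEDPolicy
          max_history: 5
          epochs: 100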
+ +- Most of the model options for ``EmbeddingPolicy``, ``EmbeddingIntentClassifier``, and ``ResponseSelector`` got + renamed. Please update your configuration files using the following mapping: + + ============================= ======================================================= + Old model option New model option + ============================= ======================================================= + hidden_layers_sizes_a dictionary "hidden_layers_sizes" with key "text" + hidden_layers_sizes_b dictionary "hidden_layers_sizes" with key "label" + hidden_layers_sizes_pre_dial dictionary "hidden_layers_sizes" with key "dialogue" + hidden_layers_sizes_bot dictionary "hidden_layers_sizes" with key "label" + num_transformer_layers number_of_transformer_layers + num_heads number_of_attention_heads + max_seq_length maximum_sequence_length + dense_dim dense_dimension + embed_dim embedding_dimension + num_neg number_of_negative_examples + mu_pos maximum_positive_similarity + mu_neg maximum_negative_similarity + use_max_sim_neg use_maximum_negative_similarity + C2 regularization_constant + C_emb negative_margin_scale + droprate_a droprate_dialogue + droprate_b droprate_label + evaluate_every_num_epochs evaluate_every_number_of_epochs + evaluate_on_num_examples evaluate_on_number_of_examples + ============================= ======================================================= + + Old configuration options will be mapped to the new names, and a warning will be thrown. + However, these will be deprecated in a future release. + +- :ref:`embedding-intent-classifier` is now deprecated and will be replaced by :ref:`DIETClassifier ` + in the future. + ``DIETClassfier`` performs intent classification as well as entity recognition. + If you want to get the same model behaviour as the current ``EmbeddingIntentClassifier``, you can use + the following configuration of ``DIETClassifier``: + + .. code-block:: yaml + + pipeline: + # - ... other components + - name: DIETClassifier + intent_classification: True + entity_recognition: False + use_masked_language_model: False + BILOU_flag: False + number_of_transformer_layers: 0 + # ... any other parameters + + See :ref:`DIETClassifier ` for more information about the new component. + Specifying ``EmbeddingIntentClassifier`` in the configuration maps to the above component definition, the + behaviour is unchanged from previous versions. + +- ``CRFEntityExtractor`` is now deprecated and will be replaced by ``DIETClassifier`` in the future. If you want to + get the same model behaviour as the current ``CRFEntityExtractor``, you can use the following configuration: + + .. code-block:: yaml + + pipeline: + # - ... other components + - name: LexicalSyntacticFeaturizer + features: [ + ["low", "title", "upper"], + [ + "BOS", + "EOS", + "low", + "prefix5", + "prefix2", + "suffix5", + "suffix3", + "suffix2", + "upper", + "title", + "digit", + ], + ["low", "title", "upper"], + ] + - name: DIETClassifier + intent_classification: False + entity_recognition: True + use_masked_language_model: False + number_of_transformer_layers: 0 + # ... any other parameters + + ``CRFEntityExtractor`` featurizes user messages on its own, it does not depend on any featurizer. + We extracted the featurization from the component into the new featurizer :ref:``LexicalSyntacticFeaturizer``. Thus, + in order to obtain the same results as before, you need to add this featurizer to your pipeline before the + :ref:``diet-classifier``. 
+ Specifying ``CRFEntityExtractor`` in the configuration maps to the above component definition, the behaviour + is unchanged from previous versions. + +- If your pipeline contains ``CRFEntityExtractor`` and ``EmbeddingIntentClassifier`` you can substitute both + components with :ref:`DIETClassifier `. You can use the following pipeline for that: + + .. code-block:: yaml + + pipeline: + # - ... other components + - name: LexicalSyntacticFeaturizer + features: [ + ["low", "title", "upper"], + [ + "BOS", + "EOS", + "low", + "prefix5", + "prefix2", + "suffix5", + "suffix3", + "suffix2", + "upper", + "title", + "digit", + ], + ["low", "title", "upper"], + ] + - name: DIETClassifier + number_of_transformer_layers: 0 + # ... any other parameters .. _migration-to-rasa-1.7: diff --git a/docs/nlu/choosing-a-pipeline.rst b/docs/nlu/choosing-a-pipeline.rst index 96932cc953fa..04d94966da81 100644 --- a/docs/nlu/choosing-a-pipeline.rst +++ b/docs/nlu/choosing-a-pipeline.rst @@ -1,6 +1,4 @@ -:desc: Set up a pipeline of pre-trained word vectors form GloVe or fastText - or fit them specifically on your dataset using the TensorFlow pipeline - for open source NLU. +:desc: Set up a pipeline of components. .. _choosing-a-pipeline: @@ -15,76 +13,142 @@ it on your dataset. .. contents:: :local: +.. note:: + With Rasa 1.8.0 we updated some components and deprecated all existing pipeline templates. + However, **any of the old terminology will still behave the same way as it did before**! + +.. warning:: + We deprecated all existing pipeline templates, e.g. + :ref:`supervised_embeddings `, + :ref:`pretrained_embeddings_spacy ` and + :ref:`pretrained_embeddings_convert `. Please list any + components you want to use directly in the configuration file. The Short Answer ---------------- -If your training data is in english, a good starting point is using ``pretrained_embeddings_convert`` pipeline. +If your training data is in English, a good starting point is the following pipeline: -.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_convert.yml +.. literalinclude:: ../../data/configs_for_docs/default_english_config.yml :language: yaml -In case your training data is multi-lingual and is rich with domain specific vocabulary, -use the ``supervised_embeddings`` pipeline: +In case your training data is in a different language than English, use the following pipeline: -.. literalinclude:: ../../sample_configs/config_supervised_embeddings.yml +.. literalinclude:: ../../data/configs_for_docs/default_config.yml :language: yaml A Longer Answer --------------- -The three most important pipelines are ``supervised_embeddings``, ``pretrained_embeddings_convert`` and ``pretrained_embeddings_spacy``. -The ``pretrained_embeddings_spacy`` pipeline uses pre-trained -word vectors from either GloVe or fastText, whereas ``pretrained_embeddings_convert`` uses a pretrained sentence encoding model `ConveRT `_ to -extract vector representations of complete user utterance as a whole. On the other hand, the ``supervised_embeddings`` pipeline -doesn't use any pre-trained word vectors or sentence vectors, but instead fits these specifically for your dataset. - -.. note:: - These recommendations are highly dependent on your dataset and hence approximate. We suggest experimenting with different pipelines to train the best model. +We recommend using following pipeline, if your training data is in English: -pretrained_embeddings_spacy -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
literalinclude:: ../../data/configs_for_docs/default_english_config.yml + :language: yaml -The advantage of ``pretrained_embeddings_spacy`` pipeline is that if you have a training example like: -"I want to buy apples", and Rasa is asked to predict the intent for "get pears", your model -already knows that the words "apples" and "pears" are very similar. This is especially useful -if you don't have large enough training data. +The pipeline contains the :ref:`ConveRTFeaturizer` that provides pre-trained word embeddings of the user utterance. +Pre-trained word embeddings are helpful as they already encode some kind of linguistic knowledge. +For example, if you have a sentence like "I want to buy apples" in your training data, and Rasa is asked to predict +the intent for "get pears", your model already knows that the words "apples" and "pears" are very similar. +This is especially useful if you don’t have enough training data. +The advantage of the :ref:`ConveRTFeaturizer` is that it doesn't treat each word of the user message independently, but +creates a contextual vector representation for the complete sentence. +However, ``ConveRT`` is only available in English. +If your training data is not in English, but you still want to use pre-trained word embeddings, we recommend using +the following pipeline: -pretrained_embeddings_convert -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - .. warning:: - Since ``ConveRT`` model is trained only on an **English** corpus of conversations, this pipeline should only be used if your training data is in English language. +.. literalinclude:: ../../data/configs_for_docs/default_spacy_config.yml + :language: yaml +It uses the :ref:`SpacyFeaturizer` instead of the :ref:`ConveRTFeaturizer`. +:ref:`SpacyFeaturizer` provides pre-trained word embeddings from either GloVe or fastText in many different languages +(see :ref:`pretrained-word-vectors`). -This pipeline uses `ConveRT `_ model to extract vector representation of a sentence and feeds them to ``EmbeddingIntentClassifier`` for intent classification. -The advantage of using ``pretrained_embeddings_convert`` pipeline is that it doesn't treat each word of the user message independently, -but creates a contextual vector representation for the complete sentence. For example, if you have a training example, like: -"can I book a car?", and Rasa is asked to predict the intent for "I need a ride from my place", since the contextual vector representation for both -examples are already very similar, the intent classified for both is highly likely to be the same. This is also useful if you don't have -large enough training data. +If you don't use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language +and can train your model to be more domain specific. +If there are no word embeddings for your language or you have very domain specific terminology, +we recommend using the following pipeline: - .. note:: - To use ``pretrained_embeddings_convert`` pipeline, you should install ``tensorflow-text==1.15.1`` and ``tensorflow-hub==0.6.0``. Otherwise, you can also pip install Rasa with ``pip install rasa[convert]``. Please also note that tensorflow-text is only currently supported on Linux platforms. +.. literalinclude:: ../../data/configs_for_docs/default_config.yml + :language: yaml -supervised_embeddings -~~~~~~~~~~~~~~~~~~~~~ +.. note:: + We encourage everyone to define their own pipeline by listing the names of the components you want to use. 
+    You can find the details of each component in :ref:`components`.
+    If you want to use custom components in your pipeline, see :ref:`custom-nlu-components`.
-The advantage of the ``supervised_embeddings`` pipeline is that your word vectors will be customised
-for your domain. For example, in general English, the word "balance" is closely related to "symmetry",
-but very different to the word "cash". In a banking domain, "balance" and "cash" are closely related
-and you'd like your model to capture that. This pipeline doesn't use a language-specific model,
-so it will work with any language that you can tokenize (on whitespace or using a custom tokenizer).
+Choosing the right Components
+-----------------------------
-You can read more about this topic `here `__ .
+A pipeline usually consists of three main parts:
-MITIE
-~~~~~
+    1. Tokenization
+    2. Featurization
+    3. Entity Recognition / Intent Classification / Response Selectors
-You can also use MITIE as a source of word vectors in your pipeline, see :ref:`section_mitie_pipeline`. The MITIE backend performs well for small datasets, but training can take very long if you have more than a couple of hundred examples.
+Tokenization
+~~~~~~~~~~~~
+If your chosen language is whitespace-tokenized (words are separated by spaces), you
+can use the :ref:`WhitespaceTokenizer`. If this is not the case you should use a different tokenizer.
+We support a number of different :ref:`tokenizers `, or you can
+create your own :ref:`custom tokenizer `.
-However, we do not recommend that you use it as mitie support is likely to be deprecated in a future release.
+.. note::
+    Some components further down the pipeline may require a specific tokenizer. You can find those requirements
+    on the individual components in :ref:`components`. If a required component is missing inside the pipeline, an
+    error will be thrown.
+
+Featurization
+~~~~~~~~~~~~~
+You need to decide whether to use components that provide pre-trained word embeddings or not.
+
+If you don't use any pre-trained word embeddings inside your pipeline, you are not bound to a specific language
+and can train your model to be more domain specific. For example, in general English, the word "balance" is closely
+related to "symmetry", but very different to the word "cash". In a banking domain, "balance" and "cash" are closely
+related and you'd like your model to capture that.
+You should only use featurizers from the category :ref:`sparse featurizers `, such as
+:ref:`CountVectorsFeaturizer`, :ref:`RegexFeaturizer` or :ref:`LexicalSyntacticFeaturizer`, if you don't want to use
+pre-trained word embeddings.
+
+The advantage of using pre-trained word embeddings in your pipeline is that if you have a training example like:
+"I want to buy apples", and Rasa is asked to predict the intent for "get pears", your model already knows that the
+words "apples" and "pears" are very similar. This is especially useful if you don't have enough training data.
+We support a few components that provide pre-trained word embeddings:
+
+1. :ref:`MitieFeaturizer`
+2. :ref:`SpacyFeaturizer`
+3. :ref:`ConveRTFeaturizer`
+4. :ref:`LanguageModelFeaturizer`
+
+If your training data is in English, we recommend using the :ref:`ConveRTFeaturizer`.
+The advantage of the :ref:`ConveRTFeaturizer` is that it doesn't treat each word of the user message independently, but
+creates a contextual vector representation for the complete sentence.
For example, if you +have a training example, like: "Can I book a car?", and Rasa is asked to predict the intent for "I need a ride from +my place", since the contextual vector representation for both examples are already very similar, the intent classified +for both is highly likely to be the same. This is also useful if you don't have enough training data. + +An alternative to :ref:`ConveRTFeaturizer` is the :ref:`LanguageModelFeaturizer` which uses pre-trained language +models such as BERT, GPT-2, etc. to extract similar contextual vector representations for the complete sentence. See +:ref:`HFTransformersNLP` for a full list of supported language models. + +If your training data is not in English you can also use a different variant of a language model which +is pre-trained in the language specific to your training data. +For example, there are chinese (``bert-base-chinese``) and japanese (``bert-base-japanese``) variants of the BERT model. +A full list of different variants of +these language models is available in the +`official documentation of the Transformers library `_. + +:ref:`SpacyFeaturizer` also provides word embeddings in many different languages (see :ref:`pretrained-word-vectors`), +so you can use this as another alternative, depending on the language of your training data. + +Entity Recognition / Intent Classification / Response Selectors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Depending on your data you may want to only perform intent classification, entity recognition or response selection. +Or you might want to combine multiple of those tasks. +We support several components for each of the tasks. All of them are listed in :ref:`components`. +We recommend using :ref:`diet-classifier` for intent classification and entity recognition +and :ref:`response-selector` for response selection. Comparing different pipelines for your data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -104,7 +168,7 @@ Class imbalance Classification algorithms often do not perform well if there is a large `class imbalance`, for example if you have a lot of training data for some intents and very little training data for others. -To mitigate this problem, rasa's ``supervised_embeddings`` pipeline uses a ``balanced`` batching strategy. +To mitigate this problem, you can use a ``balanced`` batching strategy. This algorithm ensures that all classes are represented in every batch, or at least in as many subsequent batches as possible, still mimicking the fact that some classes are more frequent than others. Balanced batching is used by default. In order to turn it off and use a classic batching strategy include @@ -115,22 +179,23 @@ Balanced batching is used by default. In order to turn it off and use a classic language: "en" pipeline: - - name: "CountVectorsFeaturizer" - - name: "EmbeddingIntentClassifier" + # - ... other components + - name: "DIETClassifier" batch_strategy: sequence Multiple Intents ---------------- -If you want to split intents into multiple labels, -e.g. for predicting multiple intents or for modeling hierarchical intent structure, -you can only do this with the supervised embeddings pipeline. -To do this, use these flags in ``Whitespace Tokenizer``: +If you want to split intents into multiple labels, e.g. for predicting multiple intents or for modeling hierarchical +intent structure, you need to use the :ref:`diet-classifier` in your pipeline. 
+You'll also need to define these flags in whichever tokenizer you are using: - - ``intent_split_symbol``: sets the delimiter string to split the intent labels. Default ``_`` + - ``intent_tokenization_flag``: Set it to ``True``, so that intent labels are tokenized. + - ``intent_split_symbol``: Set it to the delimiter string that splits the intent labels. Default ``_``. -`Here `__ is a tutorial on how to use multiple intents in Rasa Core and NLU. +Read a `tutorial `__ +on how to use multiple intents in Rasa. Here's an example configuration: @@ -140,23 +205,24 @@ Here's an example configuration: pipeline: - name: "WhitespaceTokenizer" + intent_tokenization_flag: True intent_split_symbol: "_" - name: "CountVectorsFeaturizer" - - name: "EmbeddingIntentClassifier" + - name: "DIETClassifier" Understanding the Rasa NLU Pipeline ----------------------------------- In Rasa NLU, incoming messages are processed by a sequence of components. -These components are executed one after another -in a so-called processing pipeline. There are components for entity extraction, for intent classification, response selection, +These components are executed one after another in a so-called processing pipeline. +There are components for entity extraction, for intent classification, response selection, pre-processing, and others. If you want to add your own component, for example to run a spell-check or to do sentiment analysis, check out :ref:`custom-nlu-components`. Each component processes the input and creates an output. The output can be used by any component that comes after this component in the pipeline. There are components which only produce information that is used by other components -in the pipeline and there are other components that produce ``Output`` attributes which will be returned after +in the pipeline and there are other components that produce ``output`` attributes which will be returned after the processing has finished. For example, for the sentence ``"I am looking for Chinese food"`` the output is: .. code-block:: json @@ -164,7 +230,14 @@ the processing has finished. For example, for the sentence ``"I am looking for C { "text": "I am looking for Chinese food", "entities": [ - {"start": 8, "end": 15, "value": "chinese", "entity": "cuisine", "extractor": "CRFEntityExtractor", "confidence": 0.864} + { + "start": 8, + "end": 15, + "value": "chinese", + "entity": "cuisine", + "extractor": "DIETClassifier", + "confidence": 0.864 + } ], "intent": {"confidence": 0.6485910906220309, "name": "restaurant_search"}, "intent_ranking": [ @@ -173,18 +246,21 @@ the processing has finished. For example, for the sentence ``"I am looking for C ] } -This is created as a combination of the results of the different components in the pre-configured pipeline ``pretrained_embeddings_spacy``. -For example, the ``entities`` attribute is created by the ``CRFEntityExtractor`` component. +This is created as a combination of the results of the different components in the following pipeline: + +.. literalinclude:: ../../data/configs_for_docs/default_config.yml + :language: yaml + +For example, the ``entities`` attribute is created by the ``DIETClassifier`` component. .. _section_component_lifecycle: Component Lifecycle ------------------- -Every component can implement several methods from the ``Component`` -base class; in a pipeline these different methods -will be called in a specific order. 
Lets assume, we added the following -pipeline to our config: + +Every component can implement several methods from the ``Component`` base class; in a pipeline these different methods +will be called in a specific order. Lets assume, we added the following pipeline to our config: ``"pipeline": ["Component A", "Component B", "Last Component"]``. The image shows the call order during the training of this pipeline: @@ -234,20 +310,24 @@ exactly. Instead it will return the trained synonym. .. note:: - The ``confidence`` will be set by the CRF entity extractor - (``CRFEntityExtractor`` component). The duckling entity extractor will always return - ``1``. The ``SpacyEntityExtractor`` extractor does not provide this information and - returns ``null``. + The ``confidence`` will be set by the ``CRFEntityExtractor`` and ``DIETClassifier`` component. The + ``DucklingHTTPExtractor`` will always return ``1``. The ``SpacyEntityExtractor`` extractor does not provide this + information and returns ``null``. -Pre-configured Pipelines ------------------------- -A template is just a shortcut for -a full list of components. For example, these two configurations are equivalent: +Pipeline Templates (deprecated) +------------------------------- -.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_spacy.yml - :language: yaml +A template is just a shortcut for a full list of components. For example, these two configurations are equivalent: + +.. code-block:: yaml + + language: "en" + + pipeline: "pretrained_embeddings_spacy" + +and .. code-block:: yaml @@ -262,129 +342,126 @@ a full list of components. For example, these two configurations are equivalent: - name: "EntitySynonymMapper" - name: "SklearnIntentClassifier" -Below is a list of all the pre-configured pipeline templates with customization information. +The three most important pipelines are ``supervised_embeddings``, ``pretrained_embeddings_convert`` and +``pretrained_embeddings_spacy``. +The ``pretrained_embeddings_spacy`` pipeline uses pre-trained word vectors from either GloVe or fastText, +whereas ``pretrained_embeddings_convert`` uses a pretrained sentence encoding model +`ConveRT `_ to extract vector representations of complete user +utterance as a whole. On the other hand, the ``supervised_embeddings`` pipeline doesn't use any pre-trained word +vectors or sentence vectors, but instead fits these specifically for your dataset. -.. _section_supervised_embeddings_pipeline: - -supervised_embeddings -~~~~~~~~~~~~~~~~~~~~~ +.. note:: + These recommendations are highly dependent on your dataset and hence approximate. We suggest experimenting with + different pipelines to train the best model. -To train a Rasa model in your preferred language, define the -``supervised_embeddings`` pipeline as your pipeline in your ``config.yml`` or other configuration file: +.. _section_pretrained_embeddings_spacy_pipeline: -.. literalinclude:: ../../sample_configs/config_supervised_embeddings.yml - :language: yaml +pretrained_embeddings_spacy +~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The ``supervised_embeddings`` pipeline supports any language that can be tokenized. By default it uses whitespace -for tokenization. You can customize the setup of this pipeline by adding or changing components. 
Here are the default -components that make up the ``supervised_embeddings`` pipeline: +The advantage of ``pretrained_embeddings_spacy`` pipeline is that if you have a training example like: +"I want to buy apples", and Rasa is asked to predict the intent for "get pears", your model +already knows that the words "apples" and "pears" are very similar. This is especially useful +if you don't have enough training data. -.. code-block:: yaml +To use the ``pretrained_embeddings_spacy`` template, use the following configuration: - language: "en" +.. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_spacy_config_1.yml + :language: yaml - pipeline: - - name: "WhitespaceTokenizer" - - name: "RegexFeaturizer" - - name: "CRFEntityExtractor" - - name: "EntitySynonymMapper" - - name: "CountVectorsFeaturizer" - - name: "CountVectorsFeaturizer" - analyzer: "char_wb" - min_ngram: 1 - max_ngram: 4 - - name: "EmbeddingIntentClassifier" - -So for example, if your chosen language is not whitespace-tokenized (words are not separated by spaces), you -can replace the ``WhitespaceTokenizer`` with your own tokenizer. We support a number of different :ref:`tokenizers `, -or you can :ref:`create your own `. +See :ref:`pretrained-word-vectors` for more information about loading spacy language models. +To use the components and configure them separately: -The pipeline uses two instances of ``CountVectorsFeaturizer``. The first one -featurizes text based on words. The second one featurizes text based on character -n-grams, preserving word boundaries. We empirically found the second featurizer -to be more powerful, but we decided to keep the first featurizer as well to make -featurization more robust. +.. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_spacy_config_2.yml + :language: yaml .. _section_pretrained_embeddings_convert_pipeline: pretrained_embeddings_convert ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. note:: + Since ``ConveRT`` model is trained only on an **English** corpus of conversations, this pipeline should only + be used if your training data is in English language. + +This pipeline uses the `ConveRT `_ model to extract a vector representation of +a sentence and feeds them to the ``EmbeddingIntentClassifier`` for intent classification. +The advantage of using the ``pretrained_embeddings_convert`` pipeline is that it doesn't treat each word of the user +message independently, but creates a contextual vector representation for the complete sentence. For example, if you +have a training example, like: "can I book a car?", and Rasa is asked to predict the intent for "I need a ride from +my place", since the contextual vector representation for both examples are already very similar, the intent classified +for both is highly likely to be the same. This is also useful if you don't have enough training data. + + .. note:: + To use ``pretrained_embeddings_convert`` pipeline, you should install Rasa with ``pip install rasa[convert]``. + Please also note that one of the dependencies(``tensorflow-text``) is currently only supported on Linux + platforms. + To use the ``pretrained_embeddings_convert`` template: -.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_convert.yml +.. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_convert_config_2.yml :language: yaml To use the components and configure them separately: -.. code-block:: yaml +.. 
literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_convert_config_2.yml + :language: yaml - language: "en" +.. _section_supervised_embeddings_pipeline: - pipeline: - - name: "ConveRTTokenizer" - - name: "ConveRTFeaturizer" - - name: "EmbeddingIntentClassifier" +supervised_embeddings +~~~~~~~~~~~~~~~~~~~~~ -.. _section_pretrained_embeddings_spacy_pipeline: +The advantage of the ``supervised_embeddings`` pipeline is that your word vectors will be customised +for your domain. For example, in general English, the word "balance" is closely related to "symmetry", +but very different to the word "cash". In a banking domain, "balance" and "cash" are closely related +and you'd like your model to capture that. This pipeline doesn't use a language-specific model, +so it will work with any language that you can tokenize (on whitespace or using a custom tokenizer). -pretrained_embeddings_spacy -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +You can read more about this topic `in this blog post `__ . -To use the ``pretrained_embeddings_spacy`` template: +To train a Rasa model in your preferred language, define the +``supervised_embeddings`` pipeline as your pipeline in your ``config.yml`` or other configuration file: -.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_spacy.yml +.. literalinclude:: ../../data/configs_for_docs/supervised_embeddings_config_1.yml :language: yaml -See :ref:`pretrained-word-vectors` for more information about loading spacy language models. -To use the components and configure them separately: +The ``supervised_embeddings`` pipeline supports any language that can be whitespace tokenized. By default it uses +whitespace for tokenization. You can customize the setup of this pipeline by adding or changing components. Here are +the default components that make up the ``supervised_embeddings`` pipeline: -.. code-block:: yaml +.. literalinclude:: ../../data/configs_for_docs/supervised_embeddings_config_2.yml + :language: yaml - language: "en" +So for example, if your chosen language is not whitespace-tokenized (words are not separated by spaces), you +can replace the ``WhitespaceTokenizer`` with your own tokenizer. We support a number of different :ref:`tokenizers `, +or you can :ref:`create your own `. - pipeline: - - name: "SpacyNLP" - - name: "SpacyTokenizer" - - name: "SpacyFeaturizer" - - name: "RegexFeaturizer" - - name: "CRFEntityExtractor" - - name: "EntitySynonymMapper" - - name: "SklearnIntentClassifier" +The pipeline uses two instances of ``CountVectorsFeaturizer``. The first one +featurizes text based on words. The second one featurizes text based on character +n-grams, preserving word boundaries. We empirically found the second featurizer +to be more powerful, but we decided to keep the first featurizer as well to make +featurization more robust. .. _section_mitie_pipeline: MITIE ~~~~~ +You can also use MITIE as a source of word vectors in your pipeline. +The MITIE backend performs well for small datasets, but training can take very long if you have more than a couple +of hundred examples. + +However, we do not recommend that you use it as mitie support is likely to be deprecated in a future release. + To use the MITIE pipeline, you will have to train word vectors from a corpus. Instructions can be found :ref:`here `. This will give you the file path to pass to the ``model`` parameter. -.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_mitie.yml +.. 
literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_mitie_config_1.yml :language: yaml Another version of this pipeline uses MITIE's featurizer and also its multi-class classifier. Training can be quite slow, so this is not recommended for large datasets. -.. literalinclude:: ../../sample_configs/config_pretrained_embeddings_mitie_2.yml +.. literalinclude:: ../../data/configs_for_docs/pretrained_embeddings_mitie_config_2.yml :language: yaml - - -Custom pipelines ----------------- - -You don't have to use a template, you can also run a fully custom pipeline -by listing the names of the components you want to use: - -.. code-block:: yaml - - pipeline: - - name: "SpacyNLP" - - name: "CRFEntityExtractor" - - name: "EntitySynonymMapper" - -This creates a pipeline that only does entity recognition, but no -intent classification. So Rasa NLU will not predict any intents. -You can find the details of each component in :ref:`components`. - -If you want to use custom components in your pipeline, see :ref:`custom-nlu-components`. diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 73a05846cfb2..adc8a98c548d 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -15,6 +15,10 @@ Components ``supervised_embeddings``, and ``spacy_sklearn`` is now known as ``pretrained_embeddings_spacy``. Please update your code if you are using these. +.. note:: + We deprecated all pre-defined pipeline templates. Take a look at :ref:`choosing-a-pipeline` + to decide on what components you should use in your configuration file. + This is a reference of the configuration options for every built-in component in Rasa NLU. If you want to build a custom component, check out :ref:`custom-nlu-components`. @@ -32,8 +36,8 @@ MitieNLP ~~~~~~~~ :Short: MITIE initializer -:Outputs: nothing -:Requires: nothing +:Outputs: Nothing +:Requires: Nothing :Description: Initializes mitie structures. Every mitie component relies on this, hence this should be put at the beginning @@ -57,15 +61,15 @@ MitieNLP SpacyNLP ~~~~~~~~ -:Short: spacy language initializer -:Outputs: nothing -:Requires: nothing +:Short: spaCy language initializer +:Outputs: Nothing +:Requires: Nothing :Description: - Initializes spacy structures. Every spacy component relies on this, hence this should be put at the beginning - of every pipeline that uses any spacy components. + Initializes spaCy structures. Every spaCy component relies on this, hence this should be put at the beginning + of every pipeline that uses any spaCy components. :Configuration: Language model, default will use the configured language. - If the spacy model to be used has a name that is different from the language tag (``"en"``, ``"de"``, etc.), + If the spaCy model to be used has a name that is different from the language tag (``"en"``, ``"de"``, etc.), the model name can be specified using this configuration variable. The name will be passed to ``spacy.load(name)``. .. code-block:: yaml @@ -82,6 +86,217 @@ SpacyNLP # between these two words, therefore setting this to `true`. case_sensitive: false + For more information on how to obtain the spaCy models, head over to + :ref:`installing SpaCy `. + +.. _HFTransformersNLP: + +HFTransformersNLP +~~~~~~~~~~~~~~~~~ + +:Short: HuggingFace's Transformers based pre-trained language model initializer +:Outputs: Nothing +:Requires: Nothing +:Description: + Initializes specified pre-trained language model from HuggingFace's `Transformers library + `__. 
The component applies language model specific tokenization and + featurization to compute sequence and sentence level representations for each example in the training data. + Include :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` to utilize the output of this + component for downstream NLU models. + + .. note:: To use ``HFTransformersNLP`` component, install Rasa Open Source with ``pip install rasa[transformers]``. + +:Configuration: + .. code-block:: yaml + + pipeline: + - name: HFTransformersNLP + + # Name of the language model to use + model_name: "bert" + + # Shortcut name to specify architecture variation of the above model. Full list of supported architectures + # can be found at https://huggingface.co/transformers/pretrained_models.html . If left empty, it uses the + # default model architecture that original transformers library loads + model_weights: "bert-base-uncased" + + # +----------------+--------------+-------------------------+ + # | Language Model | Parameter | Default value for | + # | | "model_name" | "model_weights" | + # +----------------+--------------+-------------------------+ + # | BERT | bert | bert-base-uncased | + # +----------------+--------------+-------------------------+ + # | GPT | gpt | openai-gpt | + # +----------------+--------------+-------------------------+ + # | GPT-2 | gpt2 | gpt2 | + # +----------------+--------------+-------------------------+ + # | XLNet | xlnet | xlnet-base-cased | + # +----------------+--------------+-------------------------+ + # | DistilBERT | distilbert | distilbert-base-uncased | + # +----------------+--------------+-------------------------+ + # | RoBERTa | roberta | roberta-base | + # +----------------+--------------+-------------------------+ + + + +.. _tokenizers: + +Tokenizers +---------- + +Tokenizers split text into tokens. +If you want to split intents into multiple labels, e.g. for predicting multiple intents or for +modeling hierarchical intent structure, use these flags with any tokenizer: + +- ``intent_tokenization_flag`` indicates whether to tokenize intent labels or not. Set it to ``True``, so that intent + labels are tokenized. +- ``intent_split_symbol`` sets the delimiter string to split the intent labels, default is underscore + (``_``). + + .. note:: All tokenizer add an additional token ``__CLS__`` to the end of the list of tokens when tokenizing + text and responses. + +.. _WhitespaceTokenizer: + +WhitespaceTokenizer +~~~~~~~~~~~~~~~~~~~ + +:Short: Tokenizer using whitespaces as a separator +:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified) +:Requires: Nothing +:Description: + Creates a token for every whitespace separated character sequence. +:Configuration: + Make the tokenizer case insensitive by adding the ``case_sensitive: False`` option, the + default being ``case_sensitive: True``. + + .. code-block:: yaml + + pipeline: + - name: "WhitespaceTokenizer" + # Flag to check whether to split intents + "intent_tokenization_flag": False + # Symbol on which intent should be split + "intent_split_symbol": "_" + # Text will be tokenized with case sensitive as default + "case_sensitive": True + + +JiebaTokenizer +~~~~~~~~~~~~~~ + +:Short: Tokenizer using Jieba for Chinese language +:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified) +:Requires: Nothing +:Description: + Creates tokens using the Jieba tokenizer specifically for Chinese + language. It will only work for the Chinese language. + + .. 
note:: + To use ``JiebaTokenizer`` you need to install Jieba with ``pip install jieba``. + +:Configuration: + User's custom dictionary files can be auto loaded by specifying the files' directory path via ``dictionary_path``. + If the ``dictionary_path`` is ``None`` (the default), then no custom dictionary will be used. + + .. code-block:: yaml + + pipeline: + - name: "JiebaTokenizer" + dictionary_path: "path/to/custom/dictionary/dir" + # Flag to check whether to split intents + "intent_tokenization_flag": False + # Symbol on which intent should be split + "intent_split_symbol": "_" + + +MitieTokenizer +~~~~~~~~~~~~~~ + +:Short: Tokenizer using MITIE +:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified) +:Requires: :ref:`MitieNLP` +:Description: Creates tokens using the MITIE tokenizer. +:Configuration: + + .. code-block:: yaml + + pipeline: + - name: "MitieTokenizer" + # Flag to check whether to split intents + "intent_tokenization_flag": False + # Symbol on which intent should be split + "intent_split_symbol": "_" + +SpacyTokenizer +~~~~~~~~~~~~~~ + +:Short: Tokenizer using spaCy +:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified) +:Requires: :ref:`SpacyNLP` +:Description: + Creates tokens using the spaCy tokenizer. +:Configuration: + + .. code-block:: yaml + + pipeline: + - name: "SpacyTokenizer" + # Flag to check whether to split intents + "intent_tokenization_flag": False + # Symbol on which intent should be split + "intent_split_symbol": "_" + +.. _ConveRTTokenizer: + +ConveRTTokenizer +~~~~~~~~~~~~~~~~ + +:Short: Tokenizer using `ConveRT `__ model. +:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified) +:Requires: Nothing +:Description: + Creates tokens using the ConveRT tokenizer. Must be used whenever the :ref:`ConveRTFeaturizer` is used. +:Configuration: + Make the tokenizer case insensitive by adding the ``case_sensitive: False`` option, the + default being ``case_sensitive: True``. + + .. code-block:: yaml + + pipeline: + - name: "ConveRTTokenizer" + # Flag to check whether to split intents + "intent_tokenization_flag": False + # Symbol on which intent should be split + "intent_split_symbol": "_" + # Text will be tokenized with case sensitive as default + "case_sensitive": True + + +.. _LanguageModelTokenizer: + +LanguageModelTokenizer +~~~~~~~~~~~~~~~~~~~~~~ + +:Short: Tokenizer from pre-trained language models +:Outputs: ``tokens`` for user messages, responses (if present), and intents (if specified) +:Requires: :ref:`HFTransformersNLP` +:Description: + Creates tokens using the pre-trained language model specified in upstream :ref:`HFTransformersNLP` component. + Must be used whenever the :ref:`LanguageModelFeaturizer` is used. +:Configuration: + + .. code-block:: yaml + + pipeline: + - name: "LanguageModelTokenizer" + # Flag to check whether to split intents + "intent_tokenization_flag": False + # Symbol on which intent should be split + "intent_split_symbol": "_" + + + .. _text-featurizers: Text Featurizers @@ -93,76 +308,96 @@ As those feature vectors would normally take up a lot of memory, we store them a Sparse features only store the values that are non zero and their positions in the vector. Thus, we save a lot of memory and are able to train on larger datasets. -By default all featurizers will return a matrix of length (number-of-tokens x feature-dimension). 
+By default all featurizers will return a matrix of length ``(number-of-tokens x feature-dimension)``. So, the returned matrix will have a feature vector for every token. This allows us to train sequence models. However, the additional token at the end (e.g. ``__CLS__``) contains features for the complete utterance. This feature vector can be used in any non-sequence model. The corresponding classifier can therefore decide what kind of features to use. + +.. _MitieFeaturizer: + MitieFeaturizer ~~~~~~~~~~~~~~~ -:Short: MITIE intent featurizer -:Outputs: nothing, used as an input to intent classifiers that need intent features (e.g. ``SklearnIntentClassifier``) +:Short: + Creates a vector representation of user message and response (if specified) using the MITIE featurizer. +:Outputs: ``dense_features`` for user messages and responses :Requires: :ref:`MitieNLP` :Type: Dense featurizer :Description: - Creates feature for intent classification using the MITIE featurizer. + Creates features for entity extraction, intent classification, and response classification using the MITIE + featurizer. .. note:: - NOT used by the ``MitieIntentClassifier`` component. Currently, only ``SklearnIntentClassifier`` is able - to use precomputed features. + NOT used by the ``MitieIntentClassifier`` component. But can be used by any component later in the pipeline + that makes use of ``dense_features``. :Configuration: + The sentence vector, i.e. the vector of the ``__CLS__`` token, can be calculated in two different ways, either via + mean or via max pooling. You can specify the pooling method in your configuration file with the option ``pooling``. + The default pooling method is set to ``mean``. .. code-block:: yaml pipeline: - name: "MitieFeaturizer" + # Specify what pooling operation should be used to calculate the vector of + # the __CLS__ token. Available options: 'mean' and 'max'. + "pooling": "mean" +.. _SpacyFeaturizer: SpacyFeaturizer ~~~~~~~~~~~~~~~ -:Short: spacy intent featurizer -:Outputs: nothing, used as an input to intent classifiers that need intent features (e.g. ``SklearnIntentClassifier``) +:Short: + Creates a vector representation of user message and response (if specified) using the spaCy featurizer. +:Outputs: ``dense_features`` for user messages and responses :Requires: :ref:`SpacyNLP` :Type: Dense featurizer :Description: - Creates feature for intent classification using the spacy featurizer. + Creates features for entity extraction, intent classification, and response classification using the spaCy + featurizer. :Configuration: + The sentence vector, i.e. the vector of the ``__CLS__`` token, can be calculated in two different ways, either via + mean or via max pooling. You can specify the pooling method in your configuration file with the option ``pooling``. + The default pooling method is set to ``mean``. .. code-block:: yaml pipeline: - name: "SpacyFeaturizer" + # Specify what pooling operation should be used to calculate the vector of + # the __CLS__ token. Available options: 'mean' and 'max'. + "pooling": "mean" +.. _ConveRTFeaturizer: + ConveRTFeaturizer ~~~~~~~~~~~~~~~~~ :Short: Creates a vector representation of user message and response (if specified) using - `ConveRT `_ model. -:Outputs: - nothing, used as an input to intent classifiers and response selectors that need intent features and response - features respectively (e.g. ``EmbeddingIntentClassifier`` and ``ResponseSelector``) + `ConveRT `__ model. 
+:Outputs: ``dense_features`` for user messages and responses :Requires: :ref:`ConveRTTokenizer` :Type: Dense featurizer :Description: - Creates features for intent classification and response selection. + Creates features for entity extraction, intent classification, and response selection. Uses the `default signature `_ to compute vector representations of input text. - .. warning:: - Since ``ConveRT`` model is trained only on an english corpus of conversations, this featurizer should only - be used if your training data is in english language. + .. note:: + Since ``ConveRT`` model is trained only on an English corpus of conversations, this featurizer should only + be used if your training data is in English language. .. note:: - To use ``ConveRTFeaturizer`` you need to install additional tensorflow libraries (``tensorflow_text`` and + To use ``ConveRTFeaturizer`` you need to install additional TensorFlow libraries (``tensorflow_text`` and ``tensorflow_hub``). You should do a pip install of Rasa with ``pip install rasa[convert]`` to install those. :Configuration: @@ -173,46 +408,77 @@ ConveRTFeaturizer - name: "ConveRTFeaturizer" +.. _LanguageModelFeaturizer: + +LanguageModelFeaturizer +~~~~~~~~~~~~~~~~~~~~~~~~ + +:Short: + Creates a vector representation of user message and response (if specified) using a pre-trained language model. +:Outputs: ``dense_features`` for user messages and responses +:Requires: :ref:`HFTransformersNLP` +:Type: Dense featurizer +:Description: + Creates features for entity extraction, intent classification, and response selection. + Uses the pre-trained language model specified in upstream :ref:`HFTransformersNLP` component to compute vector + representations of input text. + + .. note:: + Please make sure that you use a language model which is pre-trained on the same language corpus as that of your + training data. + +:Configuration: + + Include :ref:`HFTransformersNLP` and :ref:`LanguageModelTokenizer` components before this component. Use + :ref:`LanguageModelTokenizer` to ensure tokens are correctly set for all components throughout the pipeline. + + .. code-block:: yaml + + pipeline: + - name: "LanguageModelFeaturizer" + + +.. _RegexFeaturizer: + RegexFeaturizer ~~~~~~~~~~~~~~~ -:Short: regex feature creation to support intent and entity classification -:Outputs: ``text_features`` and ``tokens.pattern`` -:Requires: nothing +:Short: Creates a vector representation of user message using regular expressions. +:Outputs: ``sparse_features`` for user messages and ``tokens.pattern`` +:Requires: ``tokens`` :Type: Sparse featurizer :Description: Creates features for entity extraction and intent classification. - During training, the regex intent featurizer creates a list of `regular expressions` defined in the training + During training the ``RegexFeaturizer`` creates a list of `regular expressions` defined in the training data format. For each regex, a feature will be set marking whether this expression was found in the input, which will later be fed into intent classifier / entity extractor to simplify classification (assuming the classifier has learned - during the training phase, that this set feature indicates a certain intent). - Regex features for entity extraction are currently only supported by the ``CRFEntityExtractor`` component! + during the training phase, that this set feature indicates a certain intent / entity). 
+ Regex features for entity extraction are currently only supported by the :ref:`CRFEntityExtractor` and the + :ref:`diet-classifier` components! - .. note:: There needs to be a tokenizer previous to this featurizer in the pipeline! +:Configuration: + .. code-block:: yaml + + pipeline: + - name: "RegexFeaturizer" + +.. _CountVectorsFeaturizer: CountVectorsFeaturizer ~~~~~~~~~~~~~~~~~~~~~~ -:Short: Creates bag-of-words representation of user message and label (intent and response) features -:Outputs: - nothing, used as an input to intent classifiers that - need bag-of-words representation of intent features - (e.g. ``EmbeddingIntentClassifier``) -:Requires: nothing +:Short: Creates bag-of-words representation of user messages, intents, and responses. +:Outputs: ``sparse_features`` for user messages, intents, and responses +:Requires: ``tokens`` :Type: Sparse featurizer :Description: Creates features for intent classification and response selection. - Creates bag-of-words representation of user message and label features using + Creates bag-of-words representation of user message, intent, and response using `sklearn's CountVectorizer `_. All tokens which consist only of digits (e.g. 123 and 99 but not a123d) will be assigned to the same feature. - .. note:: - If the words in the model language cannot be split by whitespace, - a language-specific tokenizer is required in the pipeline before this component - (e.g. using ``JiebaTokenizer`` for Chinese). - :Configuration: See `sklearn's CountVectorizer docs `_ for detailed description of the configuration parameters. @@ -224,11 +490,11 @@ CountVectorsFeaturizer .. note:: Option ‘char_wb’ creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. - This option can be used to create `Subword Semantic Hashing `_ + This option can be used to create `Subword Semantic Hashing `_. .. note:: For character n-grams do not forget to increase ``min_ngram`` and ``max_ngram`` parameters. - Otherwise the vocabulary will contain only single letters + Otherwise the vocabulary will contain only single letters. Handling Out-Of-Vacabulary (OOV) words: @@ -306,10 +572,78 @@ CountVectorsFeaturizer OOV_token: None # string or None OOV_words: [] # list of strings +.. _LexicalSyntacticFeaturizer: + +LexicalSyntacticFeaturizer +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:Short: Creates lexical and syntactic features for a user message to support entity extraction. +:Outputs: ``sparse_features`` for user messages +:Requires: ``tokens`` +:Type: Sparse featurizer +:Description: + Creates features for entity extraction. + Moves with a sliding window over every token in the user message and creates features according to the + configuration (see below). As a default configuration is present, you don't need to specify a configuration. +:Configuration: + You can configure what kind of lexical and syntactic features the featurizer should extract. + The following features are available: + + .. code-block:: yaml + + # ============== ========================================================================================== + # Feature Name Description + # ============== ========================================================================================== + # BOS Checks if the token is at the beginning of the sentence. + # EOS Checks if the token is at the end of the sentence. + # low Checks if the token is lower case. + # upper Checks if the token is upper case. 
+      #   title          Checks if the token starts with an uppercase character and all remaining characters are
+      #                  lowercased.
+      #   digit          Checks if the token contains just digits.
+      #   prefix5        Take the first five characters of the token.
+      #   prefix2        Take the first two characters of the token.
+      #   suffix5        Take the last five characters of the token.
+      #   suffix3        Take the last three characters of the token.
+      #   suffix2        Take the last two characters of the token.
+      #   suffix1        Take the last character of the token.
+      #   pos            Take the Part-of-Speech tag of the token (``SpacyTokenizer`` required).
+      #   pos2           Take the first two characters of the Part-of-Speech tag of the token
+      #                  (``SpacyTokenizer`` required).
+      # ============== ==========================================================================================
+
+    As the featurizer is moving over the tokens in a user message with a sliding window, you can define features for
+    previous tokens, the current token, and the next tokens in the sliding window.
+    You define the features as a [before, token, after] array.
+    If you want to define features for the token before, the current token, and the token after,
+    your features configuration would look like this:
+
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "LexicalSyntacticFeaturizer"
+          "features": [
+            ["low", "title", "upper"],
+            [
+              "BOS",
+              "EOS",
+              "low",
+              "upper",
+              "title",
+              "digit",
+            ],
+            ["low", "title", "upper"],
+          ]
+
+    This configuration is also the default configuration.
+
+    .. note:: If you want to make use of ``pos`` or ``pos2``, you need to add ``SpacyTokenizer`` to your pipeline.
+
 Intent Classifiers
 ------------------
+Intent classifiers assign one of the intents defined in the domain file to incoming user messages.
 MitieIntentClassifier
 ~~~~~~~~~~~~~~~~~~~~~
@@ -318,7 +652,7 @@ MitieIntentClassifier
     MITIE intent classifier (using a `text categorizer `_)
 :Outputs: ``intent``
-:Requires: A tokenizer and a featurizer
+:Requires: ``tokens`` for user message
 :Output-Example:
     .. code-block:: json
@@ -342,9 +676,13 @@ MitieIntentClassifier
 SklearnIntentClassifier
 ~~~~~~~~~~~~~~~~~~~~~~~
-:Short: sklearn intent classifier
+.. warning::
+    ``SklearnIntentClassifier`` is deprecated and should be replaced by ``DIETClassifier``. See
+    :ref:`migration guide ` for more details.
+
+:Short: Sklearn intent classifier
 :Outputs: ``intent`` and ``intent_ranking``
-:Requires: A featurizer
+:Requires: ``dense_features`` for user messages
 :Output-Example:
     .. code-block:: json
@@ -364,15 +702,14 @@ SklearnIntentClassifier
     }
 :Description:
-    The sklearn intent classifier trains a linear SVM which gets optimized using a grid search. In addition
-    to other classifiers it also provides rankings of the labels that did not "win". The spacy intent classifier
-    needs to be preceded by a featurizer in the pipeline. This featurizer creates the features used for the
-    classification.
+    The sklearn intent classifier trains a linear SVM which gets optimized using a grid search. It also provides
+    rankings of the labels that did not "win". The ``SklearnIntentClassifier`` needs to be preceded by a dense
+    featurizer in the pipeline. This dense featurizer creates the features used for the classification.
 :Configuration:
     During the training of the SVM a hyperparameter search is run to find
     the best parameter set. In the config, you can specify the parameters
-    that will get tried
+    that will get tried.
     .. code-block:: yaml
@@ -386,130 +723,206 @@ SklearnIntentClassifier
         # This is used with the ``C`` hyperparameter in GridSearchCV.
kernels: ["linear"] +.. _embedding-intent-classifier: + EmbeddingIntentClassifier ~~~~~~~~~~~~~~~~~~~~~~~~~ -:Short: Embedding intent classifier +.. warning:: + ``EmbeddingIntentClassifier`` is deprecated and should be replaced by ``DIETClassifier``. See + :ref:`migration guide ` for more details. + +:Short: Embedding intent classifier for intent classification :Outputs: ``intent`` and ``intent_ranking`` -:Requires: A featurizer +:Requires: ``dense_features`` and/or ``sparse_features`` for user messages, and optionally the intent :Output-Example: .. code-block:: json { - "intent": {"name": "greet", "confidence": 0.8343}, + "intent": {"name": "greet", "confidence": 0.78343}, "intent_ranking": [ { - "confidence": 0.385910906220309, + "confidence": 0.1485910906220309, "name": "goodbye" }, { - "confidence": 0.28161531595656784, + "confidence": 0.08161531595656784, "name": "restaurant_search" } ] } :Description: - The embedding intent classifier embeds user inputs and intent labels into the same space. + The ``EmbeddingIntentClassifier`` embeds user inputs and intent labels into the same space. Supervised embeddings are trained by maximizing similarity between them. This algorithm is based on `StarSpace `_. However, in this implementation the loss function is slightly different and additional hidden layers are added together with dropout. This algorithm also provides similarity rankings of the labels that did not "win". - The embedding intent classifier needs to be preceded by a featurizer in the pipeline. - This featurizer creates the features used for the embeddings. - It is recommended to use ``CountVectorsFeaturizer`` that can be optionally preceded - by ``SpacyNLP`` and ``SpacyTokenizer``. - - .. note:: If during prediction time a message contains **only** words unseen during training, - and no Out-Of-Vacabulary preprocessor was used, - empty intent ``None`` is predicted with confidence ``0.0``. + .. note:: If during prediction time a message contains **only** words unseen during training + and no Out-Of-Vacabulary preprocessor was used, an empty intent ``None`` is predicted with confidence + ``0.0``. This might happen if you only use the :ref:`CountVectorsFeaturizer` with a ``word`` analyzer + as featurizer. If you use the ``char_wb`` analyzer, you should always get an intent with a confidence + value ``> 0.0``. :Configuration: - The algorithm also has hyperparameters to control: + The following hyperparameters can be set: - neural network's architecture: - - ``hidden_layers_sizes_a`` sets a list of hidden layer sizes before + - ``hidden_layers_sizes.text`` sets a list of hidden layer sizes before the embedding layer for user inputs, the number of hidden layers - is equal to the length of the list - - ``hidden_layers_sizes_b`` sets a list of hidden layer sizes before + is equal to the length of the list. + - ``hidden_layers_sizes.label`` sets a list of hidden layer sizes before the embedding layer for intent labels, the number of hidden layers - is equal to the length of the list - - ``share_hidden`` if set to True, shares the hidden layers between user inputs and intent label + is equal to the length of the list. + - ``share_hidden_layers`` if set to True, shares the hidden layers between user inputs and intent label. - training: - ``batch_size`` sets the number of training examples in one forward/backward pass, the higher the batch size, the more - memory space you'll need; + memory space you'll need. 
- ``batch_strategy`` sets the type of batching strategy, - it should be either ``sequence`` or ``balanced``; + it should be either ``sequence`` or ``balanced``. - ``epochs`` sets the number of times the algorithm will see training data, where one ``epoch`` equals one forward pass and - one backward pass of all the training examples; - - ``random_seed`` if set to any int will get reproducible - training results for the same inputs; + one backward pass of all the training examples. + - ``random_seed`` if set you will get reproducible + training results for the same inputs. + - ``learning_rate`` sets the initial learning rate of the optimizer. - embedding: - - ``embed_dim`` sets the dimension of embedding space; - - ``num_neg`` sets the number of incorrect intent labels, - the algorithm will minimize their similarity to the user - input during training; + - ``dense_dimension.text`` sets the dense dimensions for user inputs to use for sparse + tensors if no dense features are present. + - ``dense_dimension.label`` sets the dense dimensions for intent labels to use for sparse + tensors if no dense features are present. + - ``embedding_dimension`` sets the dimension of embedding space. + - ``number_of_negative_examples`` sets the number of incorrect intent labels. + The algorithm will minimize their similarity to the user + input during training. - ``similarity_type`` sets the type of the similarity, it should be either ``auto``, ``cosine`` or ``inner``, if ``auto``, it will be set depending on ``loss_type``, - ``inner`` for ``softmax``, ``cosine`` for ``margin``; + ``inner`` for ``softmax``, ``cosine`` for ``margin``. - ``loss_type`` sets the type of the loss function, - it should be either ``softmax`` or ``margin``; + it should be either ``softmax`` or ``margin``. - ``ranking_length`` defines the number of top confidences over - which to normalize ranking results if ``loss_type: "softmax"``; - to turn off normalization set it to 0 - - ``mu_pos`` controls how similar the algorithm should try + which to normalize ranking results if ``loss_type: "softmax"``. + To turn off normalization set it to 0. + - ``maximum_positive_similarity`` controls how similar the algorithm should try to make embedding vectors for correct intent labels, - used only if ``loss_type`` is set to ``margin``; - - ``mu_neg`` controls maximum negative similarity for - incorrect intents, - used only if ``loss_type`` is set to ``margin``; - - ``use_max_sim_neg`` if ``true`` the algorithm only + used only if ``loss_type`` is set to ``margin``. + - ``maximum_negative_similarity`` controls maximum negative similarity for + incorrect intents, used only if ``loss_type`` is set to ``margin``. + - ``use_maximum_negative_similarity`` if ``true`` the algorithm only minimizes maximum similarity over incorrect intent labels, - used only if ``loss_type`` is set to ``margin``; + used only if ``loss_type`` is set to ``margin``. - ``scale_loss`` if ``true`` the algorithm will downscale the loss for examples where correct label is predicted with high confidence, - used only if ``loss_type`` is set to ``softmax``; + used only if ``loss_type`` is set to ``softmax``. - regularization: - - ``C2`` sets the scale of L2 regularization - - ``C_emb`` sets the scale of how important is to minimize - the maximum similarity between embeddings of different intent labels; - - ``droprate`` sets the dropout rate, it should be - between ``0`` and ``1``, e.g. 
``droprate=0.1``
-          would drop out ``10%`` of input units;
+        - ``regularization_constant`` sets the scale of L2 regularization. Higher values will result in more
+          regularization.
+        - ``negative_margin_scale`` sets the scale of how important it is to minimize
+          the maximum similarity between embeddings of different intent labels.
+        - ``drop_rate`` sets the dropout rate, it should be
+          between ``0`` and ``1``, e.g. ``drop_rate=0.1`` would drop out ``10%`` of input units.
+        - ``weight_sparsity`` sets the sparsity of the weight kernels in dense layers.
+        - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not.
-    .. note:: For ``cosine`` similarity ``mu_pos`` and ``mu_neg`` should be between ``-1`` and ``1``.
+    .. note:: For ``cosine`` similarity ``maximum_positive_similarity`` and ``maximum_negative_similarity`` should
+              be between ``-1`` and ``1``.
     .. note:: There is an option to use linearly increasing batch size. The idea comes from
              ``_.
             In order to do it pass a list to ``batch_size``, e.g. ``"batch_size": [64, 256]`` (default behaviour).
             If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``.
-    In the config, you can specify these parameters.
-    The default values are defined in ``EmbeddingIntentClassifier.defaults``:
+    .. note:: Parameter ``maximum_negative_similarity`` is set to a negative value to mimic the original
+              starspace algorithm in the case ``maximum_negative_similarity = maximum_positive_similarity``
+              and ``use_maximum_negative_similarity = False``.
+              See `starspace paper `_ for details.
-    .. literalinclude:: ../../rasa/nlu/classifiers/embedding_intent_classifier.py
-        :dedent: 4
-        :start-after: # default properties (DOC MARKER - don't remove)
-        :end-before: # end default properties (DOC MARKER - don't remove)
+    Default values:
-    .. note:: Parameter ``mu_neg`` is set to a negative value to mimic the original
-        starspace algorithm in the case ``mu_neg = mu_pos`` and ``use_max_sim_neg = False``.
-        See `starspace paper `_ for details.
+    .. code-block:: yaml
+
+        pipeline:
+        - name: "EmbeddingIntentClassifier"
+          # ## Architecture of the used neural network
+          # Hidden layer sizes for layers before the embedding layers for user message
+          # and labels.
+          # The number of hidden layers is equal to the length of the corresponding
+          # list.
+          "hidden_layers_sizes": {"text": [256, 128], "label": []}
+          # Whether to share the hidden layer weights between user message and labels.
+          "share_hidden_layers": False
+          # ## Training parameters
+          # Initial and final batch sizes:
+          # Batch size will be linearly increased for each epoch.
+          "batch_size": [64, 256]
+          # Strategy used when creating batches.
+          # Can be either 'sequence' or 'balanced'.
+          "batch_strategy": "balanced"
+          # Number of epochs to train
+          "epochs": 300
+          # Set random seed to any 'int' to get reproducible results
+          "random_seed": None
+          # Initial learning rate for the optimizer
+          "learning_rate": 0.001
+          # ## Parameters for embeddings
+          # Dimension size of embedding vectors
+          "embedding_dimension": 20
+          # Default dense dimension to use if no dense features are present.
+          "dense_dimension": {"text": 512, "label": 20}
+          # The number of incorrect labels. The algorithm will minimize
+          # their similarity to the user input during training.
+          "number_of_negative_examples": 20
+          # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
+          "similarity_type": "auto"
+          # The type of the loss function, either 'softmax' or 'margin'.
+ "loss_type": "softmax" + # Number of top actions to normalize scores for loss type 'softmax'. + # Set to 0 to turn off normalization. + "ranking_length": 10 + # Indicates how similar the algorithm should try to make embedding vectors + # for correct labels. + # Should be 0.0 < ... < 1.0 for 'cosine' similarity type. + "maximum_positive_similarity": 0.8 + # Maximum negative similarity for incorrect labels. + # Should be -1.0 < ... < 1.0 for 'cosine' similarity type. + "maximum_negative_similarity": -0.4 + # If 'True' the algorithm only minimizes maximum similarity over + # incorrect intent labels, used only if 'loss_type' is set to 'margin'. + "use_maximum_negative_similarity": True + # Scale loss inverse proportionally to confidence of correct prediction + "scale_loss": True + # ## Regularization parameters + # The scale of regularization + "regularization_constant": 0.002 + # The scale of how important is to minimize the maximum similarity + # between embeddings of different labels. + "negative_margin_scale": 0.8 + # Sparsity of the weights in dense layers + "weight_sparsity": 0.8 + # Dropout rate for encoder + "drop_rate": 0.2 + # If 'True' apply dropout to sparse tensors + "use_sparse_input_dropout": False + # ## Evaluation parameters + # How often calculate validation accuracy. + # Small values may hurt performance, e.g. model accuracy. + "evaluate_every_number_of_epochs": 20 + # How many examples to use for hold out validation set + # Large values may hurt performance, e.g. model accuracy. + "evaluate_on_number_of_examples": 0 .. _keyword_intent_classifier: @@ -518,7 +931,7 @@ KeywordIntentClassifier :Short: Simple keyword matching intent classifier, intended for small, short-term projects. :Outputs: ``intent`` -:Requires: nothing +:Requires: Nothing :Output-Example: @@ -536,7 +949,7 @@ KeywordIntentClassifier This means the entire example is the keyword, not the individual words in the example. .. note:: This classifier is intended only for small projects or to get started. If - you have few NLU training data you can use one of our pipelines + you have few NLU training data, you can take a look at the recommended pipelines in :ref:`choosing-a-pipeline`. :Configuration: @@ -552,25 +965,18 @@ Selectors .. _response-selector: -Response Selector -~~~~~~~~~~~~~~~~~~ +ResponseSelector +~~~~~~~~~~~~~~~~ :Short: Response Selector :Outputs: A dictionary with key as ``direct_response_intent`` and value containing ``response`` and ``ranking`` -:Requires: A featurizer +:Requires: ``dense_features`` and/or ``sparse_features`` for user messages and response :Output-Example: .. code-block:: json { - "text": "What is the recommend python version to install?", - "entities": [], - "intent": {"confidence": 0.6485910906220309, "name": "faq"}, - "intent_ranking": [ - {"confidence": 0.6485910906220309, "name": "faq"}, - {"confidence": 0.1416153159565678, "name": "greet"} - ], "response_selector": { "faq": { "response": {"confidence": 0.7356462617, "name": "Supports 3.5, 3.6 and 3.7, recommended version is 3.6"}, @@ -587,151 +993,147 @@ Response Selector Response Selector component can be used to build a response retrieval model to directly predict a bot response from a set of candidate responses. The prediction of this model is used by :ref:`retrieval-actions`. It embeds user inputs and response labels into the same space and follows the exact same - neural network architecture and optimization as the ``EmbeddingIntentClassifier``. 
+ neural network architecture and optimization as the :ref:`diet-classifier`. - The response selector needs to be preceded by a featurizer in the pipeline. - This featurizer creates the features used for the embeddings. - It is recommended to use ``CountVectorsFeaturizer`` that can be optionally preceded - by ``SpacyNLP``. - - .. note:: If during prediction time a message contains **only** words unseen during training, - and no Out-Of-Vacabulary preprocessor was used, - empty response ``None`` is predicted with confidence ``0.0``. + .. note:: If during prediction time a message contains **only** words unseen during training + and no Out-Of-Vacabulary preprocessor was used, an empty intent ``None`` is predicted with confidence + ``0.0``. This might happen if you only use the :ref:`CountVectorsFeaturizer` with a ``word`` analyzer + as featurizer. If you use the ``char_wb`` analyzer, you should always get an intent with a confidence + value ``> 0.0``. :Configuration: - The algorithm includes all the hyperparameters that ``EmbeddingIntentClassifier`` uses. - In addition, the component can also be configured to train a response selector for a particular retrieval intent - - - ``retrieval_intent``: sets the name of the intent for which this response selector model is trained. Default ``None`` - - In the config, you can specify these parameters. - The default values are defined in ``ResponseSelector.defaults``: - - .. literalinclude:: ../../rasa/nlu/selectors/embedding_response_selector.py - :dedent: 4 - :start-after: # default properties (DOC MARKER - don't remove) - :end-before: # end default properties (DOC MARKER - don't remove) - -.. _tokenizers: - -Tokenizers ----------- + The algorithm includes all the hyperparameters that :ref:`diet-classifier` uses. + In addition, the component can also be configured to train a response selector for a particular retrieval intent. -If you want to split intents into multiple labels, e.g. for predicting multiple intents or for -modeling hierarchical intent structure, use these flags with any tokenizer: + - ``retrieval_intent`` sets the name of the intent for which this response selector model is trained. + Default is ``None``, i.e. the model is trained for all retrieval intents. -- ``intent_tokenization_flag`` indicates whether to tokenize intent labels or not. By default this flag is set to - ``False``, intent will not be tokenized. -- ``intent_split_symbol`` sets the delimiter string to split the intent labels, default is underscore - (``_``). - - .. note:: All tokenizer add an additional token ``__CLS__`` to the end of the list of tokens when tokenizing - text and responses. - -WhitespaceTokenizer -~~~~~~~~~~~~~~~~~~~ - -:Short: Tokenizer using whitespaces as a separator -:Outputs: nothing -:Requires: nothing -:Description: - Creates a token for every whitespace separated character sequence. Can be used to define tokens for the MITIE entity - extractor. -:Configuration: - Make the tokenizer not case sensitive by adding the ``case_sensitive: false`` option. Default being ``case_sensitive: true``. + Default values: .. code-block:: yaml pipeline: - - name: "WhitespaceTokenizer" - case_sensitive: false - -JiebaTokenizer -~~~~~~~~~~~~~~ - -:Short: Tokenizer using Jieba for Chinese language -:Outputs: nothing -:Requires: nothing -:Description: - Creates tokens using the Jieba tokenizer specifically for Chinese - language. For language other than Chinese, Jieba will work as - ``WhitespaceTokenizer``. 
Can be used to define tokens for the - MITIE entity extractor. Make sure to install Jieba, ``pip install jieba``. -:Configuration: - User's custom dictionary files can be auto loaded by specific the files' directory path via ``dictionary_path`` - - .. code-block:: yaml - - pipeline: - - name: "JiebaTokenizer" - dictionary_path: "path/to/custom/dictionary/dir" - -If the ``dictionary_path`` is ``None`` (the default), then no custom dictionary will be used. - -MitieTokenizer -~~~~~~~~~~~~~~ - -:Short: Tokenizer using MITIE -:Outputs: nothing -:Requires: :ref:`MitieNLP` -:Description: - Creates tokens using the MITIE tokenizer. Can be used to define - tokens for the MITIE entity extractor. -:Configuration: - - .. code-block:: yaml - - pipeline: - - name: "MitieTokenizer" - -SpacyTokenizer -~~~~~~~~~~~~~~ - -:Short: Tokenizer using spacy -:Outputs: nothing -:Requires: :ref:`SpacyNLP` -:Description: - Creates tokens using the spacy tokenizer. Can be used to define - tokens for the MITIE entity extractor. - -.. _ConveRTTokenizer: - -ConveRTTokenizer -~~~~~~~~~~~~~~~~ - -:Short: Tokenizer using ConveRT -:Outputs: nothing -:Requires: nothing -:Description: - Creates tokens using the ConveRT tokenizer. Must be used whenever the ``ConveRTFeaturizer`` is used. - + - name: "ResponseSelector" + # ## Architecture of the used neural network + # Hidden layer sizes for layers before the embedding layers for user message + # and labels. + # The number of hidden layers is equal to the length of the corresponding + # list. + hidden_layers_sizes: {"text": [256, 128], "label": [256, 128]} + # Whether to share the hidden layer weights between input words and responses + "share_hidden_layers": False + # Number of units in transformer + "transformer_size": None + # Number of transformer layers + "number_of_transformer_layers": 0 + # Number of attention heads in transformer + "number_of_attention_heads": 4 + # If 'True' use key relative embeddings in attention + "use_key_relative_attention": False + # If 'True' use key relative embeddings in attention + "use_value_relative_attention": False + # Max position for relative embeddings + "max_relative_position": None + # Use a unidirectional or bidirectional encoder. + "unidirectional_encoder": False + # ## Training parameters + # Initial and final batch sizes: + # Batch size will be linearly increased for each epoch. + "batch_size": [64, 256] + # Strategy used when creating batches. + # Can be either 'sequence' or 'balanced'. + "batch_strategy": "balanced" + # Number of epochs to train + "epochs": 300 + # Set random seed to any 'int' to get reproducible results + "random_seed": None + # Initial learning rate for the optimizer + "learning_rate": 0.001 + # ## Parameters for embeddings + # Dimension size of embedding vectors + "embedding_dimension": 20 + # Default dense dimension to use if no dense features are present. + "dense_dimension": {"text": 512, "label": 512} + # The number of incorrect labels. The algorithm will minimize + # their similarity to the user input during training. + "number_of_negative_examples": 20 + # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. + "similarity_type": "auto" + # The type of the loss function, either 'softmax' or 'margin'. + "loss_type": "softmax" + # Number of top actions to normalize scores for loss type 'softmax'. + # Set to 0 to turn off normalization. + "ranking_length": 10 + # Indicates how similar the algorithm should try to make embedding vectors + # for correct labels. + # Should be 0.0 < ... 
< 1.0 for 'cosine' similarity type. + "maximum_positive_similarity": 0.8 + # Maximum negative similarity for incorrect labels. + # Should be -1.0 < ... < 1.0 for 'cosine' similarity type. + "maximum_negative_similarity": -0.4 + # If 'True' the algorithm only minimizes maximum similarity over + # incorrect intent labels, used only if 'loss_type' is set to 'margin'. + "use_maximum_negative_similarity": True + # Scale loss inverse proportionally to confidence of correct prediction + "scale_loss": True + # ## Regularization parameters + # The scale of regularization + "regularization_constant": 0.002 + # Sparsity of the weights in dense layers + "weight_sparsity": 0.8 + # The scale of how important is to minimize the maximum similarity + # between embeddings of different labels. + "negative_margin_scale": 0.8 + # Dropout rate for encoder + "drop_rate": 0.2 + # Dropout rate for attention + "drop_rate_attention": 0 + # If 'True' apply dropout to sparse tensors + "use_sparse_input_dropout": False + # ## Evaluation parameters + # How often calculate validation accuracy. + # Small values may hurt performance, e.g. model accuracy. + "evaluate_every_number_of_epochs": 20 + # How many examples to use for hold out validation set + # Large values may hurt performance, e.g. model accuracy. + "evaluate_on_number_of_examples": 0 + # ## Selector config + # If 'True' random tokens of the input message will be masked and the model + # should predict those tokens. + "use_masked_language_model": False + # Name of the intent for which this response selector is to be trained + "retrieval_intent": None Entity Extractors ----------------- +Entity extractors extract entities, such as person names or locations, from the user message. + MitieEntityExtractor ~~~~~~~~~~~~~~~~~~~~ :Short: MITIE entity extraction (using a `MITIE NER trainer `_) -:Outputs: appends ``entities`` -:Requires: :ref:`MitieNLP` +:Outputs: ``entities`` +:Requires: :ref:`MitieNLP` and ``tokens`` :Output-Example: .. code-block:: json { - "entities": [{"value": "New York City", - "start": 20, - "end": 33, - "confidence": null, - "entity": "city", - "extractor": "MitieEntityExtractor"}] + "entities": [{ + "value": "New York City", + "start": 20, + "end": 33, + "confidence": null, + "entity": "city", + "extractor": "MitieEntityExtractor" + }] } :Description: - This uses the MITIE entity extraction to find entities in a message. The underlying classifier + ``MitieEntityExtractor`` uses the MITIE entity extraction to find entities in a message. The underlying classifier is using a multi class linear SVM with a sparse linear kernel and custom features. The MITIE component does not provide entity confidence values. :Configuration: @@ -747,28 +1149,30 @@ SpacyEntityExtractor ~~~~~~~~~~~~~~~~~~~~ :Short: spaCy entity extraction -:Outputs: appends ``entities`` +:Outputs: ``entities`` :Requires: :ref:`SpacyNLP` :Output-Example: .. code-block:: json { - "entities": [{"value": "New York City", - "start": 20, - "end": 33, - "entity": "city", - "confidence": null, - "extractor": "SpacyEntityExtractor"}] + "entities": [{ + "value": "New York City", + "start": 20, + "end": 33, + "confidence": null, + "entity": "city", + "extractor": "SpacyEntityExtractor" + }] } :Description: - Using spaCy this component predicts the entities of a message.
spaCy uses a statistical BILOU transition model. + As of now, this component can only use the spaCy builtin entity extraction models and can not be retrained. This extractor does not provide any confidence scores. :Configuration: - Configure which dimensions, i.e. entity types, the spacy component + Configure which dimensions, i.e. entity types, the spaCy component should extract. A full list of available dimensions can be found in the `spaCy documentation `_. Leaving the dimensions option unspecified will extract all available dimensions. @@ -784,100 +1188,162 @@ SpacyEntityExtractor EntitySynonymMapper ~~~~~~~~~~~~~~~~~~~ - :Short: Maps synonymous entity values to the same value. -:Outputs: modifies existing entities that previous entity extraction components found -:Requires: nothing +:Outputs: Modifies existing entities that previous entity extraction components found. +:Requires: Nothing :Description: - If the training data contains defined synonyms (by using the ``value`` attribute on the entity examples). - this component will make sure that detected entity values will be mapped to the same value. For example, - if your training data contains the following examples: + If the training data contains defined synonyms, this component will make sure that detected entity values will + be mapped to the same value. For example, if your training data contains the following examples: .. code-block:: json - [{ - "text": "I moved to New York City", - "intent": "inform_relocation", - "entities": [{"value": "nyc", - "start": 11, - "end": 24, - "entity": "city", - }] - }, - { - "text": "I got a new flat in NYC.", - "intent": "inform_relocation", - "entities": [{"value": "nyc", - "start": 20, - "end": 23, - "entity": "city", - }] - }] - - This component will allow you to map the entities ``New York City`` and ``NYC`` to ``nyc``. The entitiy + [ + { + "text": "I moved to New York City", + "intent": "inform_relocation", + "entities": [{ + "value": "nyc", + "start": 11, + "end": 24, + "entity": "city", + }] + }, + { + "text": "I got a new flat in NYC.", + "intent": "inform_relocation", + "entities": [{ + "value": "nyc", + "start": 20, + "end": 23, + "entity": "city", + }] + } + ] + + This component will allow you to map the entities ``New York City`` and ``NYC`` to ``nyc``. The entity extraction will return ``nyc`` even though the message contains ``NYC``. When this component changes an - exisiting entity, it appends itself to the processor list of this entity. + existing entity, it appends itself to the processor list of this entity. + +:Configuration: + + .. code-block:: yaml + + pipeline: + - name: "EntitySynonymMapper" + +.. _CRFEntityExtractor: CRFEntityExtractor ~~~~~~~~~~~~~~~~~~ -:Short: conditional random field entity extraction -:Outputs: appends ``entities`` -:Requires: A tokenizer +:Short: Conditional random field (CRF) entity extraction +:Outputs: ``entities`` +:Requires: ``tokens`` and ``dense_features`` (optional) :Output-Example: .. code-block:: json { - "entities": [{"value":"New York City", - "start": 20, - "end": 33, - "entity": "city", - "confidence": 0.874, - "extractor": "CRFEntityExtractor"}] + "entities": [{ + "value": "New York City", + "start": 20, + "end": 33, + "entity": "city", + "confidence": 0.874, + "extractor": "CRFEntityExtractor" + }] } :Description: - This component implements conditional random fields to do named entity recognition. + This component implements a conditional random fields (CRF) to do named entity recognition. 
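+ Formally (this is the standard linear-chain CRF formulation, not notation taken from the Rasa code base),
+ the probability of a tag sequence :math:`y` given the per-token features :math:`x` is modelled as
+
+ .. math::
+
+     p(y \mid x) \;\propto\; \exp\Big(\sum_{t}\sum_{k} \lambda_k \, f_k(y_{t-1}, y_t, x, t)\Big),
+
+ where the feature functions :math:`f_k` are built from the token features listed below and the
+ weights :math:`\lambda_k` are learned during training.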
CRFs can be thought of as an undirected Markov chain where the time steps are words and the states are entity classes. Features of the words (capitalisation, POS tagging, etc.) give probabilities to certain entity classes, as are transitions between neighbouring entity tags: the most likely set of tags is then calculated and returned. - If POS features are used (pos or pos2), spaCy has to be installed. If you want to use - additional features, such as pre-trained word embeddings, from any provided dense - featurizer, use ``"text_dense_features"``. + + .. note:: + If POS features are used (pos or pos2), you need to have ``SpacyTokenizer`` in your pipeline. + + .. note:: + If "pattern" features are used, you need to have ``RegexFeaturizer`` in your pipeline. + + .. warning:: + ``CRFEntityExtractor`` is deprecated and should be replaced by ``DIETClassifier``. See + :ref:`migration guide ` for more details. + :Configuration: - .. code-block:: yaml + ``CRFEntityExtractor`` has a list of default features to use. + However, you can overwrite the default configuration. + The following features are available: + + .. code-block:: yaml + + # ============== ========================================================================================== + # Feature Name Description + # ============== ========================================================================================== + # low Checks if the token is lower case. + # upper Checks if the token is upper case. + # title Checks if the token starts with an uppercase character and all remaining characters are + # lowercased. + # digit Checks if the token contains just digits. + # prefix5 Take the first five characters of the token. + # prefix2 Take the first two characters of the token. + # suffix5 Take the last five characters of the token. + # suffix3 Take the last three characters of the token. + # suffix2 Take the last two characters of the token. + # suffix1 Take the last character of the token. + # pos Take the Part-of-Speech tag of the token (``SpacyTokenizer`` required). + # pos2 Take the first two characters of the Part-of-Speech tag of the token + # (``SpacyTokenizer`` required). + # pattern Take the patterns defined by ``RegexFeaturizer``. + # bias Add an additional "bias" feature to the list of features. + # ============== ========================================================================================== + + As the featurizer moves over the tokens in a user message with a sliding window, you can define features for + previous tokens, the current token, and the next tokens in the sliding window. + You define the features as a ``[before, token, after]`` array. + + Additionally, you can set a flag to determine whether to use the BILOU tagging schema or not. + + - ``BILOU_flag`` determines whether to use BILOU tagging or not. Default ``True``. + + .. code-block:: yaml pipeline: - name: "CRFEntityExtractor" - # The features are a ``[before, word, after]`` array with - # before, word, after holding keys about which - # features to use for each word, for example, ``"title"`` - # in array before will have the feature - # "is the preceding word in title case?". - # Available features are: - # ``low``, ``title``, ``suffix5``, ``suffix3``, ``suffix2``, - # ``suffix1``, ``pos``, ``pos2``, ``prefix5``, ``prefix2``, - # ``bias``, ``upper``, ``digit``, ``pattern``, and ``text_dense_features`` - features: [["low", "title"], ["bias", "suffix3"], ["upper", "pos", "pos2"]] - - # The flag determines whether to use BILOU tagging or not.
BILOU - # tagging is more rigorous however - # requires more examples per entity. Rule of thumb: use only - # if more than 100 examples per entity. - BILOU_flag: true - - # This is the value given to sklearn_crfcuite.CRF tagger before training. - max_iterations: 50 - - # This is the value given to sklearn_crfcuite.CRF tagger before training. - # Specifies the L1 regularization coefficient. - L1_c: 0.1 - - # This is the value given to sklearn_crfcuite.CRF tagger before training. - # Specifies the L2 regularization coefficient. - L2_c: 0.1 + # BILOU_flag determines whether to use BILOU tagging or not. + # More rigorous, however it requires more examples per entity. + # Rule of thumb: use only if more than 100 examples per entity. + "BILOU_flag": True + # crf_features is a [before, token, after] array with before, token, + # after holding keys about which features to use for each token, + # for example, 'title' in array before will have the feature + # "is the preceding token in title case?" + # POS features require SpacyTokenizer + # pattern features require RegexFeaturizer + "features": [ + ["low", "title", "upper"], + [ + "bias", + "low", + "prefix5", + "prefix2", + "suffix5", + "suffix3", + "suffix2", + "upper", + "title", + "digit", + "pattern", + ], + ["low", "title", "upper"], + ] + # The maximum number of iterations for optimization algorithms. + "max_iterations": 50 + # weight of the L1 regularization + "L1_c": 0.1 + # weight of the L2 regularization + "L2_c": 0.1 .. _DucklingHTTPExtractor: @@ -886,19 +1352,21 @@ DucklingHTTPExtractor :Short: Duckling lets you extract common entities like dates, amounts of money, distances, and others in a number of languages. -:Outputs: appends ``entities`` -:Requires: nothing +:Outputs: ``entities`` +:Requires: Nothing :Output-Example: .. code-block:: json { - "entities": [{"end": 53, - "entity": "time", - "start": 48, - "value": "2017-04-10T00:00:00.000+02:00", - "confidence": 1.0, - "extractor": "DucklingHTTPExtractor"}] + "entities": [{ + "end": 53, + "entity": "time", + "start": 48, + "value": "2017-04-10T00:00:00.000+02:00", + "confidence": 1.0, + "extractor": "DucklingHTTPExtractor" + }] } :Description: @@ -942,3 +1410,260 @@ DucklingHTTPExtractor # Timeout for receiving response from http url of the running duckling server # if not set the default timeout of duckling http url is set to 3 seconds. timeout : 3 + + +Combined Entity Extractors and Intent Classifiers +------------------------------------------------- + +.. _diet-classifier: + +DIETClassifier +~~~~~~~~~~~~~~ + +:Short: Dual Intent Entity Transformer (DIET) used for intent classification and entity extraction +:Outputs: ``entities``, ``intent`` and ``intent_ranking`` +:Requires: ``dense_features`` and/or ``sparse_features`` for user message and optionally the intent +:Output-Example: + + .. code-block:: json + + { + "intent": {"name": "greet", "confidence": 0.8343}, + "intent_ranking": [ + { + "confidence": 0.385910906220309, + "name": "goodbye" + }, + { + "confidence": 0.28161531595656784, + "name": "restaurant_search" + } + ], + "entities": [{ + "end": 53, + "entity": "time", + "start": 48, + "value": "2017-04-10T00:00:00.000+02:00", + "confidence": 1.0, + "extractor": "DIETClassifier" + }] + } + +:Description: + DIET (Dual Intent and Entity Transformer) is a multi-task architecture for intent classification and entity + recognition. The architecture is based on a transformer which is shared for both tasks.
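+ For example, a minimal pipeline using this component could look like the one below (this mirrors the
+ example project configurations updated in this change set; the chosen featurizers and the ``epochs``
+ value are only illustrative):
+
+ .. code-block:: yaml
+
+     pipeline:
+       - name: "WhitespaceTokenizer"
+       - name: "CountVectorsFeaturizer"
+       - name: "DIETClassifier"
+         epochs: 100
+       - name: "EntitySynonymMapper"
+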
+ A sequence of entity labels is predicted through a Conditional Random Field (CRF) tagging layer on top of the + transformer output sequence corresponding to the input sequence of tokens. + The transformer output for the ``__CLS__`` token and intent labels are embedded into a single semantic vector + space. We use the dot-product loss to maximize the similarity with the target label and minimize + similarities with negative samples. + + .. note:: If during prediction time a message contains **only** words unseen during training + and no Out-Of-Vocabulary preprocessor was used, an empty intent ``None`` is predicted with confidence + ``0.0``. This might happen if you only use the :ref:`CountVectorsFeaturizer` with a ``word`` analyzer + as featurizer. If you use the ``char_wb`` analyzer, you should always get an intent with a confidence + value ``> 0.0``. + +:Configuration: + + The following hyperparameters can be set: + + - neural network's architecture: + + - ``hidden_layers_sizes.text`` sets a list of hidden layer sizes before + the embedding layer for user inputs, the number of hidden layers + is equal to the length of the list. + - ``hidden_layers_sizes.label`` sets a list of hidden layer sizes before + the embedding layer for intent labels, the number of hidden layers + is equal to the length of the list. + - ``share_hidden_layers`` if set to True, shares the hidden layers between user inputs and intent labels. + - ``transformer_size`` sets the size of the transformer. + - ``number_of_transformer_layers`` sets the number of transformer layers to use. + - ``number_of_attention_heads`` sets the number of attention heads to use. + - ``unidirectional_encoder`` specifies whether to use a unidirectional or bidirectional encoder. + - ``use_key_relative_attention`` if true use key relative embeddings in attention. + - ``use_value_relative_attention`` if true use value relative embeddings in attention. + - ``max_relative_position`` sets the max position for relative embeddings. + + - training: + + - ``batch_size`` sets the number of training examples in one + forward/backward pass, the higher the batch size, the more + memory space you'll need. + - ``batch_strategy`` sets the type of batching strategy, + it should be either ``sequence`` or ``balanced``. + - ``epochs`` sets the number of times the algorithm will see + training data, where one ``epoch`` equals one forward pass and + one backward pass of all the training examples. + - ``random_seed`` if set you will get reproducible + training results for the same inputs. + - ``learning_rate`` sets the initial learning rate of the optimizer. + + - embedding: + + - ``dense_dimension.text`` sets the dense dimensions for user inputs to use for sparse + tensors if no dense features are present. + - ``dense_dimension.label`` sets the dense dimensions for intent labels to use for sparse + tensors if no dense features are present. + - ``embedding_dimension`` sets the dimension of embedding space. + - ``number_of_negative_examples`` sets the number of incorrect intent labels. + The algorithm will minimize their similarity to the user + input during training. + - ``similarity_type`` sets the type of the similarity, + it should be either ``auto``, ``cosine`` or ``inner``, + if ``auto``, it will be set depending on ``loss_type``, + ``inner`` for ``softmax``, ``cosine`` for ``margin``. + - ``loss_type`` sets the type of the loss function, + it should be either ``softmax`` or ``margin``.
+ - ``ranking_length`` defines the number of top confidences over + which to normalize ranking results if ``loss_type: "softmax"``. + To turn off normalization set it to 0. + - ``maximum_positive_similarity`` controls how similar the algorithm should try + to make embedding vectors for correct intent labels, + used only if ``loss_type`` is set to ``margin``. + - ``maximum_negative_similarity`` controls maximum negative similarity for + incorrect intents, used only if ``loss_type`` is set to ``margin``. + - ``use_maximum_negative_similarity`` if ``true`` the algorithm only + minimizes maximum similarity over incorrect intent labels, + used only if ``loss_type`` is set to ``margin``. + - ``scale_loss`` if ``true`` the algorithm will downscale the loss + for examples where correct label is predicted with high confidence, + used only if ``loss_type`` is set to ``softmax``. + + - regularization: + + - ``regularization_constant`` sets the scale of L2 regularization. Higher values will result in more + regularization. + - ``negative_margin_scale`` sets the scale of how important is to minimize + the maximum similarity between embeddings of different intent labels. + - ``drop_rate`` sets the dropout rate, it should be + between ``0`` and ``1``, e.g. ``drop_rate=0.1`` would drop out ``10%`` of input units. + - ``drop_rate_attention`` sets the dropout rate for attention, it should be + between ``0`` and ``1``, e.g. ``drop_rate_attention=0.1`` would drop out ``10%`` of input units. + - ``weight_sparsity`` sets the sparsity of weight kernels in dense layers. + - ``use_sparse_input_dropout`` specifies whether to apply dropout to sparse tensors or not. + + - model configuration: + + - ``use_masked_language_model`` specifies whether to apply masking or not. + - ``intent_classification`` indicates whether intent classification should be performed or not. + - ``entity_recognition`` indicates whether entity recognition should be performed or not. + - ``BILOU_flag`` determines whether to use BILOU tagging or not. + + .. note:: For ``cosine`` similarity ``maximum_positive_similarity`` and ``maximum_negative_similarity`` should + be between ``-1`` and ``1``. + + .. note:: There is an option to use linearly increasing batch size. The idea comes from + ``_. + In order to do it pass a list to ``batch_size``, e.g. ``"batch_size": [64, 256]`` (default behaviour). + If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``. + + .. note:: Parameter ``maximum_negative_similarity`` is set to a negative value to mimic the original + starspace algorithm in the case ``maximum_negative_similarity = maximum_positive_similarity`` + and ``use_maximum_negative_similarity = False``. + See `starspace paper `_ for details. + + Default values: + + .. code-block:: yaml + + pipeline: + - name: "DIETClassifier" + # ## Architecture of the used neural network + # Hidden layer sizes for layers before the embedding layers for user message + # and labels. + # The number of hidden layers is equal to the length of the corresponding + # list. + "hidden_layers_sizes": {TEXT: [], LABEL: []} + # Whether to share the hidden layer weights between user message and labels. 
+ "share_hidden_layers": False + # Number of units in transformer + "transformer_size": 256 + # Number of transformer layers + "number_of_transformer_layers": 2 + # Number of attention heads in transformer + "number_of_attention_heads": 4 + # If 'True' use key relative embeddings in attention + "use_key_relative_attention": False + # If 'True' use key relative embeddings in attention + "use_value_relative_attention": False + # Max position for relative embeddings + "max_relative_position": None + # Max sequence length + "maximum_sequence_length": 256 + # Use a unidirectional or bidirectional encoder. + "unidirectional_encoder": False + # ## Training parameters + # Initial and final batch sizes: + # Batch size will be linearly increased for each epoch. + "batch_size": [64, 256] + # Strategy used when creating batches. + # Can be either 'sequence' or 'balanced'. + "batch_strategy": "balanced" + # Number of epochs to train + "epochs": 300 + # Set random seed to any 'int' to get reproducible results + "random_seed": None + # Initial learning rate for the optimizer + "learning_rate": 0.001 + # ## Parameters for embeddings + # Dimension size of embedding vectors + "embedding_dimension": 20 + # Default dense dimension to use if no dense features are present. + "dense_dimension": {TEXT: 512, LABEL: 20} + # The number of incorrect labels. The algorithm will minimize + # their similarity to the user input during training. + "number_of_negative_examples": 20 + # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. + "similarity_type": "auto" + # The type of the loss function, either 'softmax' or 'margin'. + "loss_type": "softmax" + # Number of top actions to normalize scores for loss type 'softmax'. + # Set to 0 to turn off normalization. + "ranking_length": 10 + # Indicates how similar the algorithm should try to make embedding vectors + # for correct labels. + # Should be 0.0 < ... < 1.0 for 'cosine' similarity type. + "maximum_positive_similarity": 0.8 + # Maximum negative similarity for incorrect labels. + # Should be -1.0 < ... < 1.0 for 'cosine' similarity type. + "maximum_negative_similarity": -0.4 + # If 'True' the algorithm only minimizes maximum similarity over + # incorrect intent labels, used only if 'loss_type' is set to 'margin'. + "use_maximum_negative_similarity": True + # Scale loss inverse proportionally to confidence of correct prediction + "scale_loss": True + # ## Regularization parameters + # The scale of regularization + "regularization_constant": 0.002 + # The scale of how important is to minimize the maximum similarity + # between embeddings of different labels. + "negative_margin_scale": 0.8 + # Sparsity of the weights in dense layers + "weight_sparsity": 0.8 + # Dropout rate for encoder + "drop_rate": 0.2 + # Dropout rate for attention + "drop_rate_attention": 0 + # If 'True' apply dropout to sparse tensors + "use_sparse_input_dropout": True + # ## Evaluation parameters + # How often calculate validation accuracy. + # Small values may hurt performance, e.g. model accuracy. + "evaluate_every_number_of_epochs": 20 + # How many examples to use for hold out validation set + # Large values may hurt performance, e.g. model accuracy. + "evaluate_on_number_of_examples": 0 + # ## Model config + # If 'True' intent classification is trained and intent predicted. + "intent_classification": True + # If 'True' named entity recognition is trained and entities predicted. 
+ "entity_recognition": True + # If 'True' random tokens of the input message will be masked and the model + # should predict those tokens. + "use_masked_language_model": False + # 'BILOU_flag' determines whether to use BILOU tagging or not. + # If set to 'True' labelling is more rigorous, however more + # examples per entity are required. + # Rule of thumb: you should have more than 100 examples per entity. + "BILOU_flag": True diff --git a/docs/user-guide/installation.rst b/docs/user-guide/installation.rst index f13ec328a636..6afeaaaf5a7f 100644 --- a/docs/user-guide/installation.rst +++ b/docs/user-guide/installation.rst @@ -218,6 +218,7 @@ and sklearn_crfsuite get automatically installed. However, spaCy and MITIE need $ pip install -r alt_requirements/requirements_full.txt +.. _install-spacy: Dependencies for spaCy ###################### diff --git a/examples/concertbot/config.yml b/examples/concertbot/config.yml index 39cbec66c118..14b58dfd276f 100644 --- a/examples/concertbot/config.yml +++ b/examples/concertbot/config.yml @@ -1,9 +1,19 @@ language: en -pipeline: supervised_embeddings +pipeline: + - name: "WhitespaceTokenizer" + - name: "RegexFeaturizer" + - name: "LexicalSyntacticFeaturizer" + - name: "CountVectorsFeaturizer" + - name: "CountVectorsFeaturizer" + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: "DIETClassifier" + - name: "EntitySynonymMapper" policies: - - name: EmbeddingPolicy + - name: TEDPolicy max_history: 5 epochs: 200 batch_size: 50 diff --git a/examples/formbot/config.yml b/examples/formbot/config.yml index 3aa0e7577759..e4ef40b93e77 100644 --- a/examples/formbot/config.yml +++ b/examples/formbot/config.yml @@ -2,15 +2,16 @@ language: en pipeline: - name: WhitespaceTokenizer - - name: CRFEntityExtractor - - name: EntitySynonymMapper + - name: LexicalSyntacticFeaturizer - name: CountVectorsFeaturizer token_pattern: (?u)\b\w+\b - - name: EmbeddingIntentClassifier - name: DucklingHTTPExtractor url: http://localhost:8000 dimensions: - number + - name: DIETClassifier + epochs: 100 + - name: EntitySynonymMapper policies: - name: FallbackPolicy diff --git a/examples/knowledgebasebot/config.yml b/examples/knowledgebasebot/config.yml index 00e51f7ac3a3..092617156b27 100644 --- a/examples/knowledgebasebot/config.yml +++ b/examples/knowledgebasebot/config.yml @@ -1,8 +1,20 @@ language: en -pipeline: supervised_embeddings + +pipeline: + - name: "WhitespaceTokenizer" + - name: "RegexFeaturizer" + - name: "LexicalSyntacticFeaturizer" + - name: "CountVectorsFeaturizer" + - name: "CountVectorsFeaturizer" + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: "DIETClassifier" + epochs: 100 + - name: "EntitySynonymMapper" policies: - name: MemoizationPolicy - - name: EmbeddingPolicy + - name: TEDPolicy max_history: 5 epochs: 100 diff --git a/examples/moodbot/config.yml b/examples/moodbot/config.yml index 2378258b9730..2e1c27bd1c62 100644 --- a/examples/moodbot/config.yml +++ b/examples/moodbot/config.yml @@ -1,9 +1,17 @@ language: en -pipeline: "pretrained_embeddings_spacy" +pipeline: + - name: "SpacyNLP" + - name: "SpacyTokenizer" + - name: "SpacyFeaturizer" + - name: "RegexFeaturizer" + - name: "LexicalSyntacticFeaturizer" + - name: "DIETClassifier" + epochs: 100 + - name: "EntitySynonymMapper" policies: - - name: EmbeddingPolicy + - name: TEDPolicy max_history: 5 epochs: 100 - name: MemoizationPolicy diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml index b06666b8f0dd..9bc91e7df050 100644 --- 
a/examples/restaurantbot/config.yml +++ b/examples/restaurantbot/config.yml @@ -4,26 +4,9 @@ pipeline: - name: "SpacyNLP" - name: "SpacyTokenizer" - name: "SpacyFeaturizer" - - name: "SklearnIntentClassifier" - - name: "CRFEntityExtractor" - features: [ - ["low", "title", "upper"], - [ - "bias", - "low", - "prefix5", - "prefix2", - "suffix5", - "suffix3", - "suffix2", - "upper", - "title", - "digit", - "pattern", - "text_dense_features" - ], - ["low", "title", "upper"], - ] + - name: "LexicalSyntacticFeaturizer" + - name: "DIETClassifier" + epochs: 100 - name: "EntitySynonymMapper" policies: diff --git a/rasa/__main__.py b/rasa/__main__.py index 730df495a618..6f3ca4df0b7e 100644 --- a/rasa/__main__.py +++ b/rasa/__main__.py @@ -19,6 +19,7 @@ from rasa.cli.arguments.default_arguments import add_logging_options from rasa.cli.utils import parse_last_positional_argument_as_model_path from rasa.utils.common import set_log_level +import rasa.utils.tensorflow.environment as tf_env logger = logging.getLogger(__name__) @@ -80,6 +81,8 @@ def main() -> None: ) set_log_level(log_level) + tf_env.setup_tf_environment() + # insert current path in syspath so custom modules are found sys.path.insert(1, os.getcwd()) diff --git a/rasa/cli/initial_project/config.yml b/rasa/cli/initial_project/config.yml index 116158293b14..e22e49c76008 100644 --- a/rasa/cli/initial_project/config.yml +++ b/rasa/cli/initial_project/config.yml @@ -1,13 +1,26 @@ # Configuration for Rasa NLU. # https://rasa.com/docs/rasa/nlu/components/ language: en -pipeline: supervised_embeddings +pipeline: + - name: WhitespaceTokenizer + - name: RegexFeaturizer + - name: LexicalSyntacticFeaturizer + - name: CountVectorsFeaturizer + - name: CountVectorsFeaturizer + analyzer: "char_wb" + min_ngram: 1 + max_ngram: 4 + - name: DIETClassifier + epochs: 100 + - name: EntitySynonymMapper + - name: ResponseSelector + epochs: 100 # Configuration for Rasa Core. 
# https://rasa.com/docs/rasa/core/policies/ policies: - name: MemoizationPolicy - - name: EmbeddingPolicy + - name: TEDPolicy max_history: 5 epochs: 100 - name: MappingPolicy diff --git a/rasa/constants.py b/rasa/constants.py index 75ba1b9547e2..2c4c61fd6394 100644 --- a/rasa/constants.py +++ b/rasa/constants.py @@ -39,6 +39,7 @@ DOCS_URL_COMPONENTS = DOCS_BASE_URL + "/nlu/components/" DOCS_URL_TRAINING_DATA_NLU = DOCS_BASE_URL + "/nlu/training-data-format/" DOCS_URL_MIGRATE_GOOGLE = DOCS_BASE_URL + "/migrate-from/google-dialogflow-to-rasa/" +DOCS_URL_MIGRATION_GUIDE = DOCS_BASE_URL + "/migration-guide/" DOCS_BASE_URL_RASA_X = "https://rasa.com/docs/rasa-x" @@ -48,7 +49,7 @@ CONFIG_MANDATORY_KEYS_NLU = ["language", "pipeline"] CONFIG_MANDATORY_KEYS = CONFIG_MANDATORY_KEYS_CORE + CONFIG_MANDATORY_KEYS_NLU -MINIMUM_COMPATIBLE_VERSION = "1.6.0a2" +MINIMUM_COMPATIBLE_VERSION = "1.8.0a1" GLOBAL_USER_CONFIG_PATH = os.path.expanduser("~/.config/rasa/global.yml") @@ -64,3 +65,7 @@ DEFAULT_SESSION_EXPIRATION_TIME_IN_MINUTES = 60 DEFAULT_CARRY_OVER_SLOTS_TO_NEW_SESSION = True + +ENV_GPU_CONFIG = "TF_GPU_MEMORY_ALLOC" +ENV_CPU_INTER_OP_CONFIG = "TF_INTER_OP_PARALLELISM_THREADS" +ENV_CPU_INTRA_OP_CONFIG = "TF_INTRA_OP_PARALLELISM_THREADS" diff --git a/rasa/core/constants.py b/rasa/core/constants.py index a4b2799b523c..b4d45ecca515 100644 --- a/rasa/core/constants.py +++ b/rasa/core/constants.py @@ -57,6 +57,7 @@ UTTER_PREFIX = "utter_" RESPOND_PREFIX = "respond_" +DIALOGUE = "dialogue" DEFAULT_CATEGORICAL_SLOT_VALUE = "__other__" # RabbitMQ message property header added to events published using `rasa export` diff --git a/rasa/core/interpreter.py b/rasa/core/interpreter.py index afbcca89644f..d57d17ad587d 100644 --- a/rasa/core/interpreter.py +++ b/rasa/core/interpreter.py @@ -218,7 +218,7 @@ async def parse( return result if result is not None else default_return async def _rasa_http_parse( - self, text: Text, message_id: Optional[Text] = None, + self, text: Text, message_id: Optional[Text] = None ) -> Optional[Dict[Text, Any]]: """Send a text message to a running rasa NLU http server. 
Return `None` on failure.""" diff --git a/rasa/core/lock_store.py b/rasa/core/lock_store.py index a1cf994a4762..8c234341bc9a 100644 --- a/rasa/core/lock_store.py +++ b/rasa/core/lock_store.py @@ -97,7 +97,7 @@ async def lock( self.cleanup(conversation_id, ticket) async def _acquire_lock( - self, conversation_id: Text, ticket: int, wait_time_in_seconds: float, + self, conversation_id: Text, ticket: int, wait_time_in_seconds: float ) -> TicketLock: while True: diff --git a/rasa/core/nlg/generator.py b/rasa/core/nlg/generator.py index 6b09fa5fef9b..93e119f1210d 100644 --- a/rasa/core/nlg/generator.py +++ b/rasa/core/nlg/generator.py @@ -40,7 +40,7 @@ def create( def _create_from_endpoint_config( - endpoint_config: Optional[EndpointConfig] = None, domain: Optional[Domain] = None, + endpoint_config: Optional[EndpointConfig] = None, domain: Optional[Domain] = None ) -> "NaturalLanguageGenerator": """Given an endpoint configuration, create a proper NLG object.""" diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 90ccebb4f800..b3ad427db0ca 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -1,658 +1,160 @@ -import copy -import json import logging -import os -import pickle - -import numpy as np -from typing import Any, List, Optional, Text, Dict, Tuple - -import rasa.utils.io -from rasa.core.domain import Domain -from rasa.core.featurizers import ( - TrackerFeaturizer, - FullDialogueTrackerFeaturizer, - LabelTokenizerSingleStateFeaturizer, - MaxHistoryTrackerFeaturizer, +from typing import Any, Dict, Optional, Text + +from rasa.constants import DOCS_URL_MIGRATION_GUIDE +from rasa.core.constants import DEFAULT_POLICY_PRIORITY, DIALOGUE +from rasa.core.featurizers import TrackerFeaturizer +from rasa.core.policies.ted_policy import TEDPolicy +from rasa.utils.tensorflow.constants import ( + LABEL, + HIDDEN_LAYERS_SIZES, + TRANSFORMER_SIZE, + NUM_TRANSFORMER_LAYERS, + NUM_HEADS, + BATCH_SIZES, + BATCH_STRATEGY, + EPOCHS, + RANDOM_SEED, + RANKING_LENGTH, + LOSS_TYPE, + SIMILARITY_TYPE, + NUM_NEG, + EVAL_NUM_EXAMPLES, + EVAL_NUM_EPOCHS, + NEGATIVE_MARGIN_SCALE, + REGULARIZATION_CONSTANT, + SCALE_LOSS, + USE_MAX_NEG_SIM, + MAX_NEG_SIM, + MAX_POS_SIM, + EMBEDDING_DIMENSION, + DROP_RATE_DIALOGUE, + DROP_RATE_LABEL, + DROP_RATE_ATTENTION, + WEIGHT_SPARSITY, + KEY_RELATIVE_ATTENTION, + VALUE_RELATIVE_ATTENTION, + MAX_RELATIVE_POSITION, + SOFTMAX, + AUTO, + BALANCED, ) -from rasa.core.policies.policy import Policy -from rasa.core.constants import DEFAULT_POLICY_PRIORITY -from rasa.core.trackers import DialogueStateTracker -from rasa.utils import train_utils - -import tensorflow as tf - -# avoid warning println on contrib import - remove for tf 2 -from rasa.utils.common import raise_warning +from rasa.utils.tensorflow.models import RasaModel +import rasa.utils.common as common_utils -tf.contrib._warning = None logger = logging.getLogger(__name__) -class EmbeddingPolicy(Policy): - """Transformer Embedding Dialogue Policy (TEDP) - - Transformer version of the REDP used in our paper https://arxiv.org/abs/1811.11707 +class EmbeddingPolicy(TEDPolicy): + """Transformer Embedding Dialogue (TED) Policy is described in + https://arxiv.org/abs/1910.00486. 
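+
+    With this change, ``EmbeddingPolicy`` is kept only as a deprecated alias of ``TEDPolicy``: apart from
+    its own set of default hyperparameters it adds no behaviour and emits a ``FutureWarning`` pointing to
+    the migration guide, so new configurations should reference ``TEDPolicy`` directly.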
+ + This policy has a pre-defined architecture, which comprises the + following steps: + - concatenate user input (user intent and entities), previous system actions, + slots and active forms for each time step into an input vector to + pre-transformer embedding layer; + - feed it to transformer; + - apply a dense layer to the output of the transformer to get embeddings of a + dialogue for each time step; + - apply a dense layer to create embeddings for system actions for each time + step; + - calculate the similarity between the dialogue embedding and embedded system + actions. This step is based on the StarSpace + (https://arxiv.org/abs/1709.03856) idea. """ - SUPPORTS_ONLINE_TRAINING = True - - # default properties (DOC MARKER - don't remove) + # please make sure to update the docs when changing a default parameter defaults = { - # nn architecture - # a list of hidden layers sizes before user embed layer - # number of hidden layers is equal to the length of this list - "hidden_layers_sizes_pre_dial": [], - # a list of hidden layers sizes before bot embed layer - # number of hidden layers is equal to the length of this list - "hidden_layers_sizes_bot": [], - # number of units in transformer - "transformer_size": 128, - # number of transformer layers - "num_transformer_layers": 1, - # type of positional encoding in transformer - "pos_encoding": "timing", # string 'timing' or 'emb' - # max sequence length if pos_encoding='emb' - "max_seq_length": 256, - # number of attention heads in transformer - "num_heads": 4, - # training parameters - # initial and final batch sizes: - # batch size will be linearly increased for each epoch - "batch_size": [8, 32], - # how to create batches - "batch_strategy": "balanced", # string 'sequence' or 'balanced' - # number of epochs - "epochs": 1, - # set random seed to any int to get reproducible results - "random_seed": None, - # embedding parameters - # dimension size of embedding vectors - "embed_dim": 20, - # the type of the similarity - "num_neg": 20, - # flag if minimize only maximum similarity over incorrect labels - "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' - # the type of the loss function - "loss_type": "softmax", # string 'softmax' or 'margin' - # number of top actions to normalize scores for softmax loss_type - # set to 0 to turn off normalization - "ranking_length": 10, - # how similar the algorithm should try - # to make embedding vectors for correct labels - "mu_pos": 0.8, # should be 0.0 < ... < 1.0 for 'cosine' - # maximum negative similarity for incorrect labels - "mu_neg": -0.2, # should be -1.0 < ... 
< 1.0 for 'cosine' - # the number of incorrect labels, the algorithm will minimize - # their similarity to the user input during training - "use_max_sim_neg": True, # flag which loss function to use - # scale loss inverse proportionally to confidence of correct prediction - "scale_loss": True, - # regularization - # the scale of L2 regularization - "C2": 0.001, - # the scale of how important is to minimize the maximum similarity - # between embeddings of different labels - "C_emb": 0.8, - # dropout rate for dial nn - "droprate_a": 0.1, - # dropout rate for bot nn - "droprate_b": 0.0, - # visualization of accuracy - # how often calculate validation accuracy - "evaluate_every_num_epochs": 20, # small values may hurt performance - # how many examples to use for hold out validation set - "evaluate_on_num_examples": 0, # large values may hurt performance + # ## Architecture of the used neural network + # Hidden layer sizes for layers before the dialogue and label embedding layers. + # The number of hidden layers is equal to the length of the corresponding + # list. + HIDDEN_LAYERS_SIZES: {DIALOGUE: [], LABEL: []}, + # Number of units in transformer + TRANSFORMER_SIZE: 128, + # Number of transformer layers + NUM_TRANSFORMER_LAYERS: 1, + # If 'True' use key relative embeddings in attention + KEY_RELATIVE_ATTENTION: False, + # If 'True' use key relative embeddings in attention + VALUE_RELATIVE_ATTENTION: False, + # Max position for relative embeddings + MAX_RELATIVE_POSITION: None, + # Number of attention heads in transformer + NUM_HEADS: 4, + # ## Training parameters + # Initial and final batch sizes: + # Batch size will be linearly increased for each epoch. + BATCH_SIZES: [8, 32], + # Strategy used when creating batches. + # Can be either 'sequence' or 'balanced'. + BATCH_STRATEGY: BALANCED, + # Number of epochs to train + EPOCHS: 1, + # Set random seed to any 'int' to get reproducible results + RANDOM_SEED: None, + # ## Parameters for embeddings + # Dimension size of embedding vectors + EMBEDDING_DIMENSION: 20, + # The number of incorrect labels. The algorithm will minimize + # their similarity to the user input during training. + NUM_NEG: 20, + # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. + SIMILARITY_TYPE: AUTO, + # The type of the loss function, either 'softmax' or 'margin'. + LOSS_TYPE: SOFTMAX, + # Number of top actions to normalize scores for loss type 'softmax'. + # Set to 0 to turn off normalization. + RANKING_LENGTH: 10, + # Indicates how similar the algorithm should try to make embedding vectors + # for correct labels. + # Should be 0.0 < ... < 1.0 for 'cosine' similarity type. + MAX_POS_SIM: 0.8, + # Maximum negative similarity for incorrect labels. + # Should be -1.0 < ... < 1.0 for 'cosine' similarity type. + MAX_NEG_SIM: -0.2, + # If 'True' the algorithm only minimizes maximum similarity over + # incorrect intent labels, used only if 'loss_type' is set to 'margin'. + USE_MAX_NEG_SIM: True, + # Scale loss inverse proportionally to confidence of correct prediction + SCALE_LOSS: True, + # ## Regularization parameters + # The scale of regularization + REGULARIZATION_CONSTANT: 0.001, + # The scale of how important is to minimize the maximum similarity + # between embeddings of different labels. + NEGATIVE_MARGIN_SCALE: 0.8, + # Dropout rate for embedding layers of dialogue features. + DROP_RATE_DIALOGUE: 0.1, + # Dropout rate for embedding layers of label, e.g. action, features. + DROP_RATE_LABEL: 0.0, + # Dropout rate for attention. 
+ DROP_RATE_ATTENTION: 0, + # Sparsity of the weights in dense layers + WEIGHT_SPARSITY: 0.8, + # ## Evaluation parameters + # How often calculate validation accuracy. + # Small values may hurt performance, e.g. model accuracy. + EVAL_NUM_EPOCHS: 20, + # How many examples to use for hold out validation set + # Large values may hurt performance, e.g. model accuracy. + EVAL_NUM_EXAMPLES: 0, } - # end default properties (DOC MARKER - don't remove) - - @staticmethod - def _standard_featurizer(max_history: Optional[int] = None) -> "TrackerFeaturizer": - if max_history is None: - return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) - else: - return MaxHistoryTrackerFeaturizer( - LabelTokenizerSingleStateFeaturizer(), max_history=max_history - ) def __init__( self, - featurizer: Optional["TrackerFeaturizer"] = None, + featurizer: Optional[TrackerFeaturizer] = None, priority: int = DEFAULT_POLICY_PRIORITY, - graph: Optional["tf.Graph"] = None, - session: Optional["tf.Session"] = None, - user_placeholder: Optional["tf.Tensor"] = None, - bot_placeholder: Optional["tf.Tensor"] = None, - similarity_all: Optional["tf.Tensor"] = None, - pred_confidence: Optional["tf.Tensor"] = None, - similarity: Optional["tf.Tensor"] = None, - dial_embed: Optional["tf.Tensor"] = None, - bot_embed: Optional["tf.Tensor"] = None, - all_bot_embed: Optional["tf.Tensor"] = None, - attention_weights: Optional["tf.Tensor"] = None, max_history: Optional[int] = None, - **kwargs: Any, - ) -> None: - """Declare instant variables with default values""" - - if not featurizer: - featurizer = self._standard_featurizer(max_history) - super().__init__(featurizer, priority) - - self._load_params(**kwargs) - - # encode all label_ids with numbers - self._encoded_all_label_ids = None - - # tf related instances - self.graph = graph - self.session = session - self.a_in = user_placeholder - self.b_in = bot_placeholder - self.sim_all = similarity_all - self.pred_confidence = pred_confidence - self.sim = similarity - - # persisted embeddings - self.dial_embed = dial_embed - self.bot_embed = bot_embed - self.all_bot_embed = all_bot_embed - - self.attention_weights = attention_weights - # internal tf instances - self._iterator = None - self._train_op = None - self._is_training = None - - # init helpers - def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: - self.hidden_layers_sizes = { - "pre_dial": config["hidden_layers_sizes_pre_dial"], - "bot": config["hidden_layers_sizes_bot"], - } - - self.pos_encoding = config["pos_encoding"] - self.max_seq_length = config["max_seq_length"] - self.num_heads = config["num_heads"] - - self.transformer_size = config["transformer_size"] - self.num_transformer_layers = config["num_transformer_layers"] - - self.batch_size = config["batch_size"] - self.batch_strategy = config["batch_strategy"] - - self.epochs = config["epochs"] - - self.random_seed = config["random_seed"] - - def _load_embedding_params(self, config: Dict[Text, Any]) -> None: - self.embed_dim = config["embed_dim"] - self.num_neg = config["num_neg"] - - self.similarity_type = config["similarity_type"] - self.loss_type = config["loss_type"] - if self.similarity_type == "auto": - if self.loss_type == "softmax": - self.similarity_type = "inner" - elif self.loss_type == "margin": - self.similarity_type = "cosine" - self.ranking_length = config["ranking_length"] - - self.mu_pos = config["mu_pos"] - self.mu_neg = config["mu_neg"] - self.use_max_sim_neg = config["use_max_sim_neg"] - - self.scale_loss = 
config["scale_loss"] - - def _load_regularization_params(self, config: Dict[Text, Any]) -> None: - self.C2 = config["C2"] - self.C_emb = config["C_emb"] - self.droprate = {"bot": config["droprate_b"], "dial": config["droprate_a"]} - - def _load_visual_params(self, config: Dict[Text, Any]) -> None: - self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"] - if self.evaluate_every_num_epochs < 1: - self.evaluate_every_num_epochs = self.epochs - self.evaluate_on_num_examples = config["evaluate_on_num_examples"] - - def _load_params(self, **kwargs: Dict[Text, Any]) -> None: - config = copy.deepcopy(self.defaults) - config.update(kwargs) - - self._tf_config = train_utils.load_tf_config(config) - self._load_nn_architecture_params(config) - self._load_embedding_params(config) - self._load_regularization_params(config) - self._load_visual_params(config) - - # data helpers - # noinspection PyPep8Naming - @staticmethod - def _label_ids_for_Y(data_Y: "np.ndarray") -> "np.ndarray": - """Prepare Y data for training: extract label_ids.""" - - return data_Y.argmax(axis=-1) - - # noinspection PyPep8Naming - def _label_features_for_Y(self, label_ids: "np.ndarray") -> "np.ndarray": - """Prepare Y data for training: features for label_ids.""" - - if len(label_ids.shape) == 2: # full dialogue featurizer is used - return np.stack( - [ - np.stack( - [ - self._encoded_all_label_ids[label_idx] - for label_idx in seq_label_ids - ] - ) - for seq_label_ids in label_ids - ] - ) - else: # max history featurizer is used - return np.stack( - [self._encoded_all_label_ids[label_idx] for label_idx in label_ids] - ) - - # noinspection PyPep8Naming - def _create_session_data( - self, data_X: "np.ndarray", data_Y: Optional["np.ndarray"] = None - ) -> "train_utils.SessionDataType": - """Combine all tf session related data into dict.""" - if data_Y is not None: - # training time - label_ids = self._label_ids_for_Y(data_Y) - Y = self._label_features_for_Y(label_ids) - # explicitly add last dimension to label_ids - # to track correctly dynamic sequences - label_ids = np.expand_dims(label_ids, -1) - else: - # prediction time - label_ids = np.asarray([]) - Y = np.asarray([]) - - return { - "dialogue_features": [data_X], - "bot_features": [Y], - "action_ids": [label_ids], - } - - def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": - """Create embedding bot vector.""" - - b = train_utils.create_tf_fnn( - b_in, - self.hidden_layers_sizes["bot"], - self.droprate["bot"], - self.C2, - self._is_training, - layer_name_suffix="bot", - ) - return train_utils.create_tf_embed( - b, self.embed_dim, self.C2, self.similarity_type, layer_name_suffix="bot" - ) - - def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]: - """Create dialogue level embedding and mask.""" - - # mask different length sequences - # if there is at least one `-1` it should be masked - mask = tf.sign(tf.reduce_max(self.a_in, -1) + 1) - - a = train_utils.create_tf_fnn( - a_in, - self.hidden_layers_sizes["pre_dial"], - self.droprate["dial"], - self.C2, - self._is_training, - layer_name_suffix="pre_dial", - ) - - self.attention_weights = {} - hparams = train_utils.create_t2t_hparams( - self.num_transformer_layers, - self.transformer_size, - self.num_heads, - self.droprate["dial"], - self.pos_encoding, - self.max_seq_length, - self._is_training, - ) - - a = train_utils.create_t2t_transformer_encoder( - a, mask, self.attention_weights, hparams, self.C2, self._is_training - ) - - if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): - 
# pick last label if max history featurizer is used - a = a[:, -1:, :] - mask = mask[:, -1:] - - dial_embed = train_utils.create_tf_embed( - a, self.embed_dim, self.C2, self.similarity_type, layer_name_suffix="dial" - ) - - return dial_embed, mask - - def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: - """Bulid train graph using iterator.""" - # iterator returns a_in, b_in, action_ids - self.a_in, self.b_in, _ = self._iterator.get_next() - - if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): - # add time dimension if max history featurizer is used - self.b_in = self.b_in[:, tf.newaxis, :] - - all_bot_raw = tf.constant( - self._encoded_all_label_ids, dtype=tf.float32, name="all_bot_raw" - ) - - self.dial_embed, mask = self._create_tf_dial(self.a_in) - - self.bot_embed = self._create_tf_bot_embed(self.b_in) - self.all_bot_embed = self._create_tf_bot_embed(all_bot_raw) - - return train_utils.calculate_loss_acc( - self.dial_embed, - self.bot_embed, - self.b_in, - self.all_bot_embed, - all_bot_raw, - self.num_neg, - mask, - self.loss_type, - self.mu_pos, - self.mu_neg, - self.use_max_sim_neg, - self.C_emb, - self.scale_loss, - ) - - # prepare for prediction - def _create_tf_placeholders( - self, session_data: "train_utils.SessionDataType" - ) -> None: - """Create placeholders for prediction.""" - - dialogue_len = None # use dynamic time - self.a_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, session_data["dialogue_features"][0].shape[-1]), - name="a", - ) - self.b_in = tf.placeholder( - dtype=tf.float32, - shape=(None, dialogue_len, None, session_data["bot_features"][0].shape[-1]), - name="b", - ) - - def _build_tf_pred_graph( - self, session_data: "train_utils.SessionDataType" - ) -> "tf.Tensor": - """Rebuild tf graph for prediction.""" - - self._create_tf_placeholders(session_data) - - self.dial_embed, mask = self._create_tf_dial(self.a_in) - - self.sim_all = train_utils.tf_raw_sim( - self.dial_embed[:, :, tf.newaxis, :], - self.all_bot_embed[tf.newaxis, tf.newaxis, :, :], - mask, - ) - - self.bot_embed = self._create_tf_bot_embed(self.b_in) - - self.sim = train_utils.tf_raw_sim( - self.dial_embed[:, :, tf.newaxis, :], self.bot_embed, mask - ) - - return train_utils.confidence_from_sim(self.sim_all, self.similarity_type) - - # training methods - def train( - self, - training_trackers: List["DialogueStateTracker"], - domain: "Domain", - **kwargs: Any, + model: Optional[RasaModel] = None, + **kwargs: Dict[Text, Any], ) -> None: - """Train the policy on given training trackers.""" - - logger.debug("Started training embedding policy.") - - # set numpy random seed - np.random.seed(self.random_seed) - - # dealing with training data - training_data = self.featurize_for_training(training_trackers, domain, **kwargs) - - # encode all label_ids with policies' featurizer - state_featurizer = self.featurizer.state_featurizer - self._encoded_all_label_ids = state_featurizer.create_encoded_all_actions( - domain - ) - - # check if number of negatives is less than number of label_ids - logger.debug( - "Check if num_neg {} is smaller " - "than number of label_ids {}, " - "else set num_neg to the number of label_ids - 1" - "".format(self.num_neg, domain.num_actions) - ) - # noinspection PyAttributeOutsideInit - self.num_neg = min(self.num_neg, domain.num_actions - 1) - - # extract actual training data to feed to tf session - session_data = self._create_session_data(training_data.X, training_data.y) - - if self.evaluate_on_num_examples: - session_data, 
eval_session_data = train_utils.train_val_split( - session_data, - self.evaluate_on_num_examples, - self.random_seed, - label_key="action_ids", - ) - else: - eval_session_data = None - - self.graph = tf.Graph() - with self.graph.as_default(): - # set random seed in tf - tf.set_random_seed(self.random_seed) - - # allows increasing batch size - batch_size_in = tf.placeholder(tf.int64) - - ( - self._iterator, - train_init_op, - eval_init_op, - ) = train_utils.create_iterator_init_datasets( - session_data, - eval_session_data, - batch_size_in, - self.batch_strategy, - label_key="action_ids", - ) - - self._is_training = tf.placeholder_with_default(False, shape=()) - - loss, acc = self._build_tf_train_graph() - - # define which optimizer to use - self._train_op = tf.train.AdamOptimizer().minimize(loss) - - # train tensorflow graph - self.session = tf.Session(config=self._tf_config) - train_utils.train_tf_dataset( - train_init_op, - eval_init_op, - batch_size_in, - loss, - acc, - self._train_op, - self.session, - self._is_training, - self.epochs, - self.batch_size, - self.evaluate_on_num_examples, - self.evaluate_every_num_epochs, - ) - - # rebuild the graph for prediction - self.pred_confidence = self._build_tf_pred_graph(session_data) - - self.attention_weights = train_utils.extract_attention( - self.attention_weights - ) - - def tf_feed_dict_for_prediction( - self, tracker: "DialogueStateTracker", domain: "Domain" - ) -> Dict["tf.Tensor", "np.ndarray"]: - """Create feed dictionary for tf session.""" - - # noinspection PyPep8Naming - data_X = self.featurizer.create_X([tracker], domain) - session_data = self._create_session_data(data_X) - - return {self.a_in: session_data["dialogue_features"][0]} - - def predict_action_probabilities( - self, tracker: "DialogueStateTracker", domain: "Domain" - ) -> List[float]: - """Predict the next action the bot should take. - - Return the list of probabilities for the next actions. - """ - - if self.session is None: - logger.error( - "There is no trained tf.session: " - "component is either not trained or " - "didn't receive enough training data" - ) - return [0.0] * domain.num_actions - - tf_feed_dict = self.tf_feed_dict_for_prediction(tracker, domain) - - confidence = self.session.run(self.pred_confidence, feed_dict=tf_feed_dict) - confidence = confidence[0, -1, :] - - if self.loss_type == "softmax" and self.ranking_length > 0: - confidence = train_utils.normalize(confidence, self.ranking_length) - - return confidence.tolist() - - def persist(self, path: Text) -> None: - """Persists the policy to a storage.""" - - if self.session is None: - logger.debug( - "Method `persist(...)` was called " - "without a trained model present. " - "Nothing to persist then!" 
- ) - return - - self.featurizer.persist(path) - - meta = { - "priority": self.priority, - "loss_type": self.loss_type, - "ranking_length": self.ranking_length, - } - - meta_file = os.path.join(path, "embedding_policy.json") - rasa.utils.io.dump_obj_as_json_to_file(meta_file, meta) - - file_name = "tensorflow_embedding.ckpt" - checkpoint = os.path.join(path, file_name) - rasa.utils.io.create_directory_for_file(checkpoint) - - with self.graph.as_default(): - train_utils.persist_tensor("user_placeholder", self.a_in, self.graph) - train_utils.persist_tensor("bot_placeholder", self.b_in, self.graph) - - train_utils.persist_tensor("similarity_all", self.sim_all, self.graph) - train_utils.persist_tensor( - "pred_confidence", self.pred_confidence, self.graph - ) - train_utils.persist_tensor("similarity", self.sim, self.graph) - - train_utils.persist_tensor("dial_embed", self.dial_embed, self.graph) - train_utils.persist_tensor("bot_embed", self.bot_embed, self.graph) - train_utils.persist_tensor("all_bot_embed", self.all_bot_embed, self.graph) - - train_utils.persist_tensor( - "attention_weights", self.attention_weights, self.graph - ) - - saver = tf.train.Saver() - saver.save(self.session, checkpoint) - - with open(os.path.join(path, file_name + ".tf_config.pkl"), "wb") as f: - pickle.dump(self._tf_config, f) - - @classmethod - def load(cls, path: Text) -> "EmbeddingPolicy": - """Loads a policy from the storage. - - **Needs to load its featurizer** - """ - - if not os.path.exists(path): - raise Exception( - "Failed to load dialogue model. Path '{}' " - "doesn't exist".format(os.path.abspath(path)) - ) - - featurizer = TrackerFeaturizer.load(path) - - file_name = "tensorflow_embedding.ckpt" - checkpoint = os.path.join(path, file_name) - - if not os.path.exists(checkpoint + ".meta"): - return cls(featurizer=featurizer) - - meta_file = os.path.join(path, "embedding_policy.json") - meta = json.loads(rasa.utils.io.read_file(meta_file)) - - with open(os.path.join(path, file_name + ".tf_config.pkl"), "rb") as f: - _tf_config = pickle.load(f) - - graph = tf.Graph() - with graph.as_default(): - session = tf.Session(config=_tf_config) - saver = tf.train.import_meta_graph(checkpoint + ".meta") - - saver.restore(session, checkpoint) - - a_in = train_utils.load_tensor("user_placeholder") - b_in = train_utils.load_tensor("bot_placeholder") - - sim_all = train_utils.load_tensor("similarity_all") - pred_confidence = train_utils.load_tensor("pred_confidence") - sim = train_utils.load_tensor("similarity") - - dial_embed = train_utils.load_tensor("dial_embed") - bot_embed = train_utils.load_tensor("bot_embed") - all_bot_embed = train_utils.load_tensor("all_bot_embed") - - attention_weights = train_utils.load_tensor("attention_weights") + super().__init__(featurizer, priority, max_history, model, **kwargs) - return cls( - featurizer=featurizer, - priority=meta.pop("priority"), - graph=graph, - session=session, - user_placeholder=a_in, - bot_placeholder=b_in, - similarity_all=sim_all, - pred_confidence=pred_confidence, - similarity=sim, - dial_embed=dial_embed, - bot_embed=bot_embed, - all_bot_embed=all_bot_embed, - attention_weights=attention_weights, - **meta, + common_utils.raise_warning( + f"'EmbeddingPolicy' is deprecated and will be removed in version 2.0. 
" + f"Use 'TEDPolicy' instead.", + category=FutureWarning, + docs=DOCS_URL_MIGRATION_GUIDE, ) diff --git a/rasa/core/policies/fallback.py b/rasa/core/policies/fallback.py index 1f314ed89611..d700be0f052f 100644 --- a/rasa/core/policies/fallback.py +++ b/rasa/core/policies/fallback.py @@ -128,7 +128,7 @@ def fallback_scores( ) -> List[float]: """Prediction scores used if a fallback is necessary.""" - result = [0.0] * domain.num_actions + result = self._default_predictions(domain) idx = domain.index_for_action(self.fallback_action_name) result[idx] = fallback_score return result @@ -145,7 +145,7 @@ def predict_action_probabilities( nlu_data = tracker.latest_message.parse_data if tracker.latest_action_name == self.fallback_action_name: - result = [0.0] * domain.num_actions + result = self._default_predictions(domain) idx = domain.index_for_action(ACTION_LISTEN_NAME) result[idx] = 1.0 diff --git a/rasa/core/policies/form_policy.py b/rasa/core/policies/form_policy.py index f6af451c8d16..3deab0bc4ced 100644 --- a/rasa/core/policies/form_policy.py +++ b/rasa/core/policies/form_policy.py @@ -140,7 +140,7 @@ def predict_action_probabilities( self, tracker: DialogueStateTracker, domain: Domain ) -> List[float]: """Predicts the corresponding form action if there is an active form""" - result = [0.0] * domain.num_actions + result = self._default_predictions(domain) if tracker.active_form.get("name"): logger.debug( diff --git a/rasa/core/policies/keras_policy.py b/rasa/core/policies/keras_policy.py index 8a49fe143119..61d60c480949 100644 --- a/rasa/core/policies/keras_policy.py +++ b/rasa/core/policies/keras_policy.py @@ -5,7 +5,7 @@ import tensorflow as tf import numpy as np import warnings -from typing import Any, List, Dict, Text, Optional, Tuple +from typing import Any, List, Dict, Text, Optional, Tuple, Union import rasa.utils.io @@ -17,12 +17,16 @@ from rasa.core.featurizers import TrackerFeaturizer from rasa.core.policies.policy import Policy from rasa.core.trackers import DialogueStateTracker -from rasa.utils.common import obtain_verbosity +import rasa.utils.common as common_utils from rasa.core.constants import DEFAULT_POLICY_PRIORITY +from rasa.constants import DOCS_URL_MIGRATION_GUIDE + # there are a number of issues with imports from tensorflow. hence the deactivation # pytype: disable=import-error # pytype: disable=module-attr + + try: import cPickle as pickle except ImportError: @@ -56,8 +60,6 @@ def __init__( featurizer: Optional[TrackerFeaturizer] = None, priority: int = DEFAULT_POLICY_PRIORITY, model: Optional[tf.keras.models.Sequential] = None, - graph: Optional[tf.Graph] = None, - session: Optional[tf.compat.v1.Session] = None, current_epoch: int = 0, max_history: Optional[int] = None, **kwargs: Any, @@ -68,21 +70,21 @@ def __init__( self._load_params(**kwargs) self.model = model - # by default keras uses default tf graph and global tf session - # we are going to either load them or create them in train(...) - self.graph = graph - self.session = session self.current_epoch = current_epoch - def _load_params(self, **kwargs: Dict[Text, Any]) -> None: - from rasa.utils.train_utils import load_tf_config + common_utils.raise_warning( + "'KerasPolicy' is deprecated and will be removed in version " + "2.0. 
Use 'TEDPolicy' instead.", + category=FutureWarning, + docs=DOCS_URL_MIGRATION_GUIDE, + ) + def _load_params(self, **kwargs: Dict[Text, Any]) -> None: config = copy.deepcopy(self.defaults) config.update(kwargs) # filter out kwargs that are used explicitly - self._tf_config = load_tf_config(config) self.rnn_size = config.pop("rnn_size") self.epochs = config.pop("epochs") self.batch_size = config.pop("batch_size") @@ -151,7 +153,7 @@ def model_architecture( loss="categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"] ) - if obtain_verbosity() > 0: + if common_utils.obtain_verbosity() > 0: model.summary() return model @@ -163,48 +165,40 @@ def train( **kwargs: Any, ) -> None: - # set numpy random seed np.random.seed(self.random_seed) + tf.random.set_seed(self.random_seed) training_data = self.featurize_for_training(training_trackers, domain, **kwargs) # noinspection PyPep8Naming shuffled_X, shuffled_y = training_data.shuffled_X_y() - self.graph = tf.Graph() - with self.graph.as_default(): - # set random seed in tf - tf.set_random_seed(self.random_seed) - self.session = tf.compat.v1.Session(config=self._tf_config) - - with self.session.as_default(): - if self.model is None: - self.model = self.model_architecture( - shuffled_X.shape[1:], shuffled_y.shape[1:] - ) - - logger.info( - "Fitting model with {} total samples and a " - "validation split of {}" - "".format(training_data.num_examples(), self.validation_split) - ) + if self.model is None: + self.model = self.model_architecture( + shuffled_X.shape[1:], shuffled_y.shape[1:] + ) - # filter out kwargs that cannot be passed to fit - self._train_params = self._get_valid_params( - self.model.fit, **self._train_params - ) + logger.debug( + f"Fitting model with {training_data.num_examples()} total samples and a " + f"validation split of {self.validation_split}." 
+ ) - self.model.fit( - shuffled_X, - shuffled_y, - epochs=self.epochs, - batch_size=self.batch_size, - shuffle=False, - verbose=obtain_verbosity(), - **self._train_params, - ) - # the default parameter for epochs in keras fit is 1 - self.current_epoch = self.defaults.get("epochs", 1) - logger.info("Done fitting keras policy model") + # filter out kwargs that cannot be passed to fit + self._train_params = self._get_valid_params( + self.model.fit, **self._train_params + ) + + self.model.fit( + shuffled_X, + shuffled_y, + epochs=self.epochs, + batch_size=self.batch_size, + shuffle=False, + verbose=common_utils.obtain_verbosity(), + **self._train_params, + ) + self.current_epoch = self.epochs + + logger.debug("Done fitting Keras Policy model.") def predict_action_probabilities( self, tracker: DialogueStateTracker, domain: Domain @@ -213,8 +207,7 @@ def predict_action_probabilities( # noinspection PyPep8Naming X = self.featurizer.create_X([tracker], domain) - with self.graph.as_default(), self.session.as_default(): - y_pred = self.model.predict(X, batch_size=1) + y_pred = self.model.predict(X, batch_size=1) if len(y_pred.shape) == 2: return y_pred[-1].tolist() @@ -240,12 +233,8 @@ def persist(self, path: Text) -> None: model_file = os.path.join(path, meta["model"]) # makes sure the model directory exists rasa.utils.io.create_directory_for_file(model_file) - with self.graph.as_default(), self.session.as_default(): - self.model.save(model_file, overwrite=True) + self.model.save(model_file, overwrite=True) - tf_config_file = os.path.join(path, "keras_policy.tf_config.pkl") - with open(tf_config_file, "wb") as f: - pickle.dump(self._tf_config, f) else: logger.debug( "Method `persist(...)` was called " @@ -263,26 +252,16 @@ def load(cls, path: Text) -> "KerasPolicy": if os.path.isfile(meta_file): meta = json.loads(rasa.utils.io.read_file(meta_file)) - tf_config_file = os.path.join(path, "keras_policy.tf_config.pkl") - with open(tf_config_file, "rb") as f: - _tf_config = pickle.load(f) - model_file = os.path.join(path, meta["model"]) - graph = tf.Graph() - with graph.as_default(): - session = tf.compat.v1.Session(config=_tf_config) - with session.as_default(): - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - model = load_model(model_file) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + model = load_model(model_file) return cls( featurizer=featurizer, priority=meta["priority"], model=model, - graph=graph, - session=session, current_epoch=meta["epochs"], ) else: diff --git a/rasa/core/policies/mapping_policy.py b/rasa/core/policies/mapping_policy.py index ebf92aa14401..413aed58ac9f 100644 --- a/rasa/core/policies/mapping_policy.py +++ b/rasa/core/policies/mapping_policy.py @@ -91,7 +91,7 @@ def predict_action_probabilities( predicted with the highest probability of all policies. If it is not the policy will predict zero for every action.""" - prediction = [0.0] * domain.num_actions + prediction = self._default_predictions(domain) intent = tracker.latest_message.intent.get("name") if intent == USER_INTENT_RESTART: action = ACTION_RESTART_NAME diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index 16ffd1bc4d9a..7527a2489144 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -45,7 +45,15 @@ def featurizer(self): @staticmethod def _get_valid_params(func: Callable, **kwargs: Any) -> Dict: - # filter out kwargs that cannot be passed to func + """Filters out kwargs that cannot be passed to func. 
+ + Args: + func: a callable function + + Returns: + the dictionary of parameters + """ + valid_keys = rasa.utils.common.arguments_of(func) params = {key: kwargs.get(key) for key in valid_keys if kwargs.get(key)} @@ -62,8 +70,18 @@ def featurize_for_training( **kwargs: Any, ) -> DialogueTrainingData: """Transform training trackers into a vector representation. + The trackers, consisting of multiple turns, will be transformed - into a float vector which can be used by a ML model.""" + into a float vector which can be used by a ML model. + + Args: + training_trackers: + the list of the :class:`rasa.core.trackers.DialogueStateTracker` + domain: the :class:`rasa.core.domain.Domain` + + Returns: + the :class:`rasa.core.training.data.DialogueTrainingData` + """ training_data = self.featurizer.featurize_trackers(training_trackers, domain) @@ -83,42 +101,79 @@ def train( domain: Domain, **kwargs: Any, ) -> None: - """Trains the policy on given training trackers.""" + """Trains the policy on given training trackers. + + Args: + training_trackers: + the list of the :class:`rasa.core.trackers.DialogueStateTracker` + domain: the :class:`rasa.core.domain.Domain` + """ raise NotImplementedError("Policy must have the capacity to train.") def predict_action_probabilities( self, tracker: DialogueStateTracker, domain: Domain ) -> List[float]: - """Predicts the next action the bot should take - after seeing the tracker. + """Predicts the next action the bot should take after seeing the tracker. + + Args: + tracker: the :class:`rasa.core.trackers.DialogueStateTracker` + domain: the :class:`rasa.core.domain.Domain` - Returns the list of probabilities for the next actions""" + Returns: + the list of probabilities for the next actions + """ raise NotImplementedError("Policy must have the capacity to predict.") def persist(self, path: Text) -> None: - """Persists the policy to a storage.""" + """Persists the policy to a storage. + + Args: + path: the path where to save the policy to + """ + raise NotImplementedError("Policy must have the capacity to persist itself.") @classmethod def load(cls, path: Text) -> "Policy": """Loads a policy from the storage. - Needs to load its featurizer""" + + Needs to load its featurizer. + + Args: + path: the path from where to load the policy + """ + raise NotImplementedError("Policy must have the capacity to load itself.") + @staticmethod + def _default_predictions(domain: Domain) -> List[float]: + """Creates a list of zeros. + + Args: + domain: the :class:`rasa.core.domain.Domain` + Returns: + the list of the length of the number of actions + """ -def confidence_scores_for(action_name, value, domain) -> List[float]: + return [0.0] * domain.num_actions + + +def confidence_scores_for( + action_name: Text, value: float, domain: Domain +) -> List[float]: """Returns confidence scores if a single action is predicted. Args: - action_name: Name of action for which the score should be set. - value: Confidence for `action_name`. - domain: Domain which contains all actions. - - Returns: List of length `len(nr_actions)`. 
+ action_name: the name of the action for which the score should be set + value: the confidence for `action_name` + domain: the :class:`rasa.core.domain.Domain` + Returns: + the list of the length of the number of actions """ + results = [0.0] * domain.num_actions idx = domain.index_for_action(action_name) results[idx] = value diff --git a/rasa/core/policies/registry.py b/rasa/core/policies/registry.py index 6d11c97b3b0f..5578ad317275 100644 --- a/rasa/core/policies/registry.py +++ b/rasa/core/policies/registry.py @@ -1,6 +1,9 @@ # Import all policies at one place to be able to to resolve them via a common module # path. Don't do this in `__init__.py` to avoid importing them without need. +# noinspection PyUnresolvedReferences +from rasa.core.policies.ted_policy import TEDPolicy + # noinspection PyUnresolvedReferences from rasa.core.policies.embedding_policy import EmbeddingPolicy @@ -24,3 +27,6 @@ # noinspection PyUnresolvedReferences from rasa.core.policies.mapping_policy import MappingPolicy + +# noinspection PyUnresolvedReferences +from rasa.core.policies.embedding_policy import EmbeddingPolicy diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index c25326a77802..fee324f4432d 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -176,8 +176,7 @@ def persist(self, path: Text) -> None: rasa.utils.io.dump_obj_as_json_to_file(meta_file, meta) filename = os.path.join(path, "sklearn_model.pkl") - with open(filename, "wb") as f: - pickle.dump(self._state, f) + rasa.utils.io.pickle_dump(filename, self._state) else: raise_warning( "Persist called without a trained model present. " @@ -201,10 +200,11 @@ def load(cls, path: Text) -> Policy: meta_file = os.path.join(path, "sklearn_policy.json") meta = json.loads(rasa.utils.io.read_file(meta_file)) + policy = cls(featurizer=featurizer, priority=meta["priority"]) - with open(filename, "rb") as f: - state = pickle.load(f) + state = rasa.utils.io.pickle_load(filename) + vars(policy).update(state) logger.info("Loaded sklearn model") diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py new file mode 100644 index 000000000000..3a78c873f393 --- /dev/null +++ b/rasa/core/policies/ted_policy.py @@ -0,0 +1,631 @@ +import copy +import logging +import os +from pathlib import Path + +import numpy as np +import tensorflow as tf +import tensorflow_addons as tfa + +from typing import Any, List, Optional, Text, Dict, Tuple, Union + +import rasa.utils.io as io_utils +from rasa.core.domain import Domain +from rasa.core.featurizers import ( + TrackerFeaturizer, + FullDialogueTrackerFeaturizer, + LabelTokenizerSingleStateFeaturizer, + MaxHistoryTrackerFeaturizer, +) +from rasa.core.policies.policy import Policy +from rasa.core.constants import DEFAULT_POLICY_PRIORITY, DIALOGUE +from rasa.core.trackers import DialogueStateTracker +from rasa.utils import train_utils +from rasa.utils.tensorflow import layers +from rasa.utils.tensorflow.transformer import TransformerEncoder +from rasa.utils.tensorflow.models import RasaModel +from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature +from rasa.utils.tensorflow.constants import ( + LABEL, + HIDDEN_LAYERS_SIZES, + TRANSFORMER_SIZE, + NUM_TRANSFORMER_LAYERS, + NUM_HEADS, + BATCH_SIZES, + BATCH_STRATEGY, + EPOCHS, + RANDOM_SEED, + RANKING_LENGTH, + LOSS_TYPE, + SIMILARITY_TYPE, + NUM_NEG, + EVAL_NUM_EXAMPLES, + EVAL_NUM_EPOCHS, + NEGATIVE_MARGIN_SCALE, + REGULARIZATION_CONSTANT, + SCALE_LOSS, + 
USE_MAX_NEG_SIM, + MAX_NEG_SIM, + MAX_POS_SIM, + EMBEDDING_DIMENSION, + DROP_RATE_DIALOGUE, + DROP_RATE_LABEL, + DROP_RATE_ATTENTION, + WEIGHT_SPARSITY, + KEY_RELATIVE_ATTENTION, + VALUE_RELATIVE_ATTENTION, + MAX_RELATIVE_POSITION, + SOFTMAX, + AUTO, + BALANCED, +) + + +logger = logging.getLogger(__name__) + +DIALOGUE_FEATURES = f"{DIALOGUE}_features" +LABEL_FEATURES = f"{LABEL}_features" +LABEL_IDS = f"{LABEL}_ids" + +SAVE_MODEL_FILE_NAME = "ted_policy" + + +class TEDPolicy(Policy): + """Transformer Embedding Dialogue (TED) Policy is described in + https://arxiv.org/abs/1910.00486. + + This policy has a pre-defined architecture, which comprises the + following steps: + - concatenate user input (user intent and entities), previous system actions, + slots and active forms for each time step into an input vector to + pre-transformer embedding layer; + - feed it to transformer; + - apply a dense layer to the output of the transformer to get embeddings of a + dialogue for each time step; + - apply a dense layer to create embeddings for system actions for each time + step; + - calculate the similarity between the dialogue embedding and embedded system + actions. This step is based on the StarSpace + (https://arxiv.org/abs/1709.03856) idea. + """ + + SUPPORTS_ONLINE_TRAINING = True + + # please make sure to update the docs when changing a default parameter + defaults = { + # ## Architecture of the used neural network + # Hidden layer sizes for layers before the dialogue and label embedding layers. + # The number of hidden layers is equal to the length of the corresponding + # list. + HIDDEN_LAYERS_SIZES: {DIALOGUE: [], LABEL: []}, + # Number of units in transformer + TRANSFORMER_SIZE: 128, + # Number of transformer layers + NUM_TRANSFORMER_LAYERS: 1, + # Number of attention heads in transformer + NUM_HEADS: 4, + # If 'True' use key relative embeddings in attention + KEY_RELATIVE_ATTENTION: False, + # If 'True' use key relative embeddings in attention + VALUE_RELATIVE_ATTENTION: False, + # Max position for relative embeddings + MAX_RELATIVE_POSITION: None, + # ## Training parameters + # Initial and final batch sizes: + # Batch size will be linearly increased for each epoch. + BATCH_SIZES: [8, 32], + # Strategy used whenc creating batches. + # Can be either 'sequence' or 'balanced'. + BATCH_STRATEGY: BALANCED, + # Number of epochs to train + EPOCHS: 1, + # Set random seed to any 'int' to get reproducible results + RANDOM_SEED: None, + # ## Parameters for embeddings + # Dimension size of embedding vectors + EMBEDDING_DIMENSION: 20, + # The number of incorrect labels. The algorithm will minimize + # their similarity to the user input during training. + NUM_NEG: 20, + # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. + SIMILARITY_TYPE: AUTO, + # The type of the loss function, either 'softmax' or 'margin'. + LOSS_TYPE: SOFTMAX, + # Number of top actions to normalize scores for loss type 'softmax'. + # Set to 0 to turn off normalization. + RANKING_LENGTH: 10, + # Indicates how similar the algorithm should try to make embedding vectors + # for correct labels. + # Should be 0.0 < ... < 1.0 for 'cosine' similarity type. + MAX_POS_SIM: 0.8, + # Maximum negative similarity for incorrect labels. + # Should be -1.0 < ... < 1.0 for 'cosine' similarity type. + MAX_NEG_SIM: -0.2, + # If 'True' the algorithm only minimizes maximum similarity over + # incorrect intent labels, used only if 'loss_type' is set to 'margin'. 
+ USE_MAX_NEG_SIM: True, + # Scale loss inverse proportionally to confidence of correct prediction + SCALE_LOSS: True, + # ## Regularization parameters + # The scale of regularization + REGULARIZATION_CONSTANT: 0.001, + # The scale of how important is to minimize the maximum similarity + # between embeddings of different labels. + NEGATIVE_MARGIN_SCALE: 0.8, + # Dropout rate for embedding layers of dialogue features. + DROP_RATE_DIALOGUE: 0.1, + # Dropout rate for embedding layers of label, e.g. action, features. + DROP_RATE_LABEL: 0.0, + # Dropout rate for attention. + DROP_RATE_ATTENTION: 0, + # Sparsity of the weights in dense layers + WEIGHT_SPARSITY: 0.8, + # ## Evaluation parameters + # How often calculate validation accuracy. + # Small values may hurt performance, e.g. model accuracy. + EVAL_NUM_EPOCHS: 20, + # How many examples to use for hold out validation set + # Large values may hurt performance, e.g. model accuracy. + EVAL_NUM_EXAMPLES: 0, + } + + @staticmethod + def _standard_featurizer(max_history: Optional[int] = None) -> TrackerFeaturizer: + if max_history is None: + return FullDialogueTrackerFeaturizer(LabelTokenizerSingleStateFeaturizer()) + else: + return MaxHistoryTrackerFeaturizer( + LabelTokenizerSingleStateFeaturizer(), max_history=max_history + ) + + def __init__( + self, + featurizer: Optional[TrackerFeaturizer] = None, + priority: int = DEFAULT_POLICY_PRIORITY, + max_history: Optional[int] = None, + model: Optional[RasaModel] = None, + **kwargs: Dict[Text, Any], + ) -> None: + """Declare instance variables with default values.""" + + if not featurizer: + featurizer = self._standard_featurizer(max_history) + + super().__init__(featurizer, priority) + + self._load_params(**kwargs) + + self.model = model + + self._label_data: Optional[RasaModelData] = None + self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None + + def _load_params(self, **kwargs: Dict[Text, Any]) -> None: + self.config = copy.deepcopy(self.defaults) + self.config.update(kwargs) + + self.config = train_utils.check_deprecated_options(self.config) + + self.config = train_utils.update_similarity_type(self.config) + self.config = train_utils.update_evaluation_parameters(self.config) + + # data helpers + # noinspection PyPep8Naming + @staticmethod + def _label_ids_for_Y(data_Y: np.ndarray) -> np.ndarray: + """Prepare Y data for training: extract label_ids. + + label_ids are indices of labels, while `data_Y` contains one-hot encodings. 
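+        For example, a one-hot row `[0, 1, 0]` corresponds to label id `1`
+        (the position of its maximum, taken with `argmax` below).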
+ """ + + return data_Y.argmax(axis=-1) + + # noinspection PyPep8Naming + def _label_features_for_Y(self, label_ids: np.ndarray) -> np.ndarray: + """Prepare Y data for training: features for label_ids.""" + + all_label_features = self._label_data.get(LABEL_FEATURES)[0] + + is_full_dialogue_featurizer_used = len(label_ids.shape) == 2 + if is_full_dialogue_featurizer_used: + return np.stack( + [ + np.stack( + [all_label_features[label_idx] for label_idx in seq_label_ids] + ) + for seq_label_ids in label_ids + ] + ) + + # max history featurizer is used + return np.stack([all_label_features[label_idx] for label_idx in label_ids]) + + # noinspection PyPep8Naming + def _create_model_data( + self, data_X: np.ndarray, data_Y: Optional[np.ndarray] = None + ) -> RasaModelData: + """Combine all model related data into RasaModelData.""" + + label_ids = np.array([]) + Y = np.array([]) + + if data_Y is not None: + label_ids = self._label_ids_for_Y(data_Y) + Y = self._label_features_for_Y(label_ids) + # explicitly add last dimension to label_ids + # to track correctly dynamic sequences + label_ids = np.expand_dims(label_ids, -1) + + model_data = RasaModelData(label_key=LABEL_IDS) + model_data.add_features(DIALOGUE_FEATURES, [data_X]) + model_data.add_features(LABEL_FEATURES, [Y]) + model_data.add_features(LABEL_IDS, [label_ids]) + + return model_data + + def _create_label_data(self, domain: Domain) -> RasaModelData: + # encode all label_ids with policies' featurizer + state_featurizer = self.featurizer.state_featurizer + all_labels = state_featurizer.create_encoded_all_actions(domain) + all_labels = all_labels.astype(np.float32) + + label_data = RasaModelData() + label_data.add_features(LABEL_FEATURES, [all_labels]) + return label_data + + def train( + self, + training_trackers: List[DialogueStateTracker], + domain: Domain, + **kwargs: Any, + ) -> None: + """Train the policy on given training trackers.""" + + # dealing with training data + training_data = self.featurize_for_training(training_trackers, domain, **kwargs) + + self._label_data = self._create_label_data(domain) + + # extract actual training data to feed to model + model_data = self._create_model_data(training_data.X, training_data.y) + if model_data.is_empty(): + logger.error( + f"Can not train '{self.__class__.__name__}'. No data was provided. " + f"Skipping training of the policy." + ) + return + + # keep one example for persisting and loading + self.data_example = model_data.first_data_example() + + self.model = TED( + model_data.get_signature(), + self.config, + isinstance(self.featurizer, MaxHistoryTrackerFeaturizer), + self._label_data, + ) + + self.model.fit( + model_data, + self.config[EPOCHS], + self.config[BATCH_SIZES], + self.config[EVAL_NUM_EXAMPLES], + self.config[EVAL_NUM_EPOCHS], + batch_strategy=self.config[BATCH_STRATEGY], + ) + + def predict_action_probabilities( + self, tracker: DialogueStateTracker, domain: Domain + ) -> List[float]: + """Predict the next action the bot should take. + + Return the list of probabilities for the next actions. 
+ """ + + if self.model is None: + return self._default_predictions(domain) + + # create model data from tracker + data_X = self.featurizer.create_X([tracker], domain) + model_data = self._create_model_data(data_X) + + output = self.model.predict(model_data) + + confidence = output["action_scores"].numpy() + # remove batch dimension and take the last prediction in the sequence + confidence = confidence[0, -1, :] + + if self.config[LOSS_TYPE] == SOFTMAX and self.config[RANKING_LENGTH] > 0: + confidence = train_utils.normalize(confidence, self.config[RANKING_LENGTH]) + + return confidence.tolist() + + def persist(self, path: Text) -> None: + """Persists the policy to a storage.""" + + if self.model is None: + logger.debug( + "Method `persist(...)` was called " + "without a trained model present. " + "Nothing to persist then!" + ) + return + + model_path = Path(path) + tf_model_file = model_path / f"{SAVE_MODEL_FILE_NAME}.tf_model" + + io_utils.create_directory_for_file(tf_model_file) + + self.featurizer.persist(path) + + self.model.save(str(tf_model_file)) + + io_utils.json_pickle( + model_path / f"{SAVE_MODEL_FILE_NAME}.priority.pkl", self.priority + ) + io_utils.pickle_dump( + model_path / f"{SAVE_MODEL_FILE_NAME}.meta.pkl", self.config + ) + io_utils.json_pickle( + model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl", self.data_example + ) + io_utils.json_pickle( + model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl", self._label_data + ) + + @classmethod + def load(cls, path: Text) -> "TEDPolicy": + """Loads a policy from the storage. + + **Needs to load its featurizer** + """ + + if not os.path.exists(path): + raise Exception( + f"Failed to load TED policy model. Path " + f"'{os.path.abspath(path)}' doesn't exist." + ) + + model_path = Path(path) + tf_model_file = model_path / f"{SAVE_MODEL_FILE_NAME}.tf_model" + + featurizer = TrackerFeaturizer.load(path) + + if not (model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl").is_file(): + return cls(featurizer=featurizer) + + loaded_data = io_utils.json_unpickle( + model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl" + ) + label_data = io_utils.json_unpickle( + model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl" + ) + meta = io_utils.pickle_load(model_path / f"{SAVE_MODEL_FILE_NAME}.meta.pkl") + priority = io_utils.json_unpickle( + model_path / f"{SAVE_MODEL_FILE_NAME}.priority.pkl" + ) + + model_data_example = RasaModelData(label_key=LABEL_IDS, data=loaded_data) + meta = train_utils.update_similarity_type(meta) + + model = TED.load( + str(tf_model_file), + model_data_example, + data_signature=model_data_example.get_signature(), + config=meta, + max_history_tracker_featurizer_used=isinstance( + featurizer, MaxHistoryTrackerFeaturizer + ), + label_data=label_data, + ) + + # build the graph for prediction + predict_data_example = RasaModelData( + label_key=LABEL_IDS, + data={ + feature_name: features + for feature_name, features in model_data_example.items() + if DIALOGUE in feature_name + }, + ) + model.build_for_predict(predict_data_example) + + return cls(featurizer=featurizer, priority=priority, model=model, **meta) + + +# accessing _tf_layers with any key results in key-error, disable it +# pytype: disable=key-error + + +class TED(RasaModel): + def __init__( + self, + data_signature: Dict[Text, List[FeatureSignature]], + config: Dict[Text, Any], + max_history_tracker_featurizer_used: bool, + label_data: RasaModelData, + ) -> None: + super().__init__(name="TED", random_seed=config[RANDOM_SEED]) + + self.config = config + 
self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used + + # data + self.data_signature = data_signature + self._check_data() + + self.predict_data_signature = { + feature_name: features + for feature_name, features in data_signature.items() + if DIALOGUE in feature_name + } + + # optimizer + self._set_optimizer(tf.keras.optimizers.Adam()) + + self.all_labels_embed = None + + label_batch = label_data.prepare_batch() + self.tf_label_data = self.batch_to_model_data_format( + label_batch, label_data.get_signature() + ) + + # metrics + self.action_loss = tf.keras.metrics.Mean(name="loss") + self.action_acc = tf.keras.metrics.Mean(name="acc") + self.metrics_to_log += ["loss", "acc"] + + # set up tf layers + self._tf_layers: Dict[Text : tf.keras.layers.Layer] = {} + self._prepare_layers() + + def _check_data(self) -> None: + if DIALOGUE_FEATURES not in self.data_signature: + raise ValueError( + f"No text features specified. " + f"Cannot train '{self.__class__.__name__}' model." + ) + if LABEL_FEATURES not in self.data_signature: + raise ValueError( + f"No label features specified. " + f"Cannot train '{self.__class__.__name__}' model." + ) + + def _prepare_layers(self) -> None: + self._tf_layers[f"loss.{LABEL}"] = layers.DotProductLoss( + self.config[NUM_NEG], + self.config[LOSS_TYPE], + self.config[MAX_POS_SIM], + self.config[MAX_NEG_SIM], + self.config[USE_MAX_NEG_SIM], + self.config[NEGATIVE_MARGIN_SCALE], + self.config[SCALE_LOSS], + # set to 1 to get deterministic behaviour + parallel_iterations=1 if self.random_seed is not None else 1000, + ) + self._tf_layers[f"ffnn.{DIALOGUE}"] = layers.Ffnn( + self.config[HIDDEN_LAYERS_SIZES][DIALOGUE], + self.config[DROP_RATE_DIALOGUE], + self.config[REGULARIZATION_CONSTANT], + self.config[WEIGHT_SPARSITY], + layer_name_suffix=DIALOGUE, + ) + self._tf_layers[f"ffnn.{LABEL}"] = layers.Ffnn( + self.config[HIDDEN_LAYERS_SIZES][LABEL], + self.config[DROP_RATE_LABEL], + self.config[REGULARIZATION_CONSTANT], + self.config[WEIGHT_SPARSITY], + layer_name_suffix=LABEL, + ) + self._tf_layers["transformer"] = TransformerEncoder( + self.config[NUM_TRANSFORMER_LAYERS], + self.config[TRANSFORMER_SIZE], + self.config[NUM_HEADS], + self.config[TRANSFORMER_SIZE] * 4, + self.config[REGULARIZATION_CONSTANT], + dropout_rate=self.config[DROP_RATE_DIALOGUE], + attention_dropout_rate=self.config[DROP_RATE_ATTENTION], + sparsity=self.config[WEIGHT_SPARSITY], + unidirectional=True, + use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION], + use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION], + max_relative_position=self.config[MAX_RELATIVE_POSITION], + name=DIALOGUE + "_encoder", + ) + self._tf_layers[f"embed.{DIALOGUE}"] = layers.Embed( + self.config[EMBEDDING_DIMENSION], + self.config[REGULARIZATION_CONSTANT], + DIALOGUE, + self.config[SIMILARITY_TYPE], + ) + self._tf_layers[f"embed.{LABEL}"] = layers.Embed( + self.config[EMBEDDING_DIMENSION], + self.config[REGULARIZATION_CONSTANT], + LABEL, + self.config[SIMILARITY_TYPE], + ) + + def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]: + all_labels = self.tf_label_data[LABEL_FEATURES][0] + all_labels_embed = self._embed_label(all_labels) + + return all_labels, all_labels_embed + + def _emebed_dialogue(self, dialogue_in: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: + """Create dialogue level embedding and mask.""" + + # mask different length sequences + # if there is at least one `-1` it should be masked + mask = tf.sign(tf.reduce_max(dialogue_in, axis=-1) + 1) + + 
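+        # for a padded dialogue turn the maximum feature value is `-1`, so
+        # `reduce_max(...) + 1` is 0 and `tf.sign` yields 0; real turns yield 1.
+        # The inverted mask is passed to the transformer below so that padded
+        # positions are ignored.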
dialogue = self._tf_layers[f"ffnn.{DIALOGUE}"](dialogue_in, self._training) + dialogue_transformed = self._tf_layers["transformer"]( + dialogue, 1 - tf.expand_dims(mask, axis=-1), self._training + ) + dialogue_transformed = tfa.activations.gelu(dialogue_transformed) + + if self.max_history_tracker_featurizer_used: + # pick last label if max history featurizer is used + dialogue_transformed = dialogue_transformed[:, -1:, :] + mask = mask[:, -1:] + + dialogue_embed = self._tf_layers[f"embed.{DIALOGUE}"](dialogue_transformed) + + return dialogue_embed, mask + + def _embed_label(self, label_in: Union[tf.Tensor, np.ndarray]) -> tf.Tensor: + label = self._tf_layers[f"ffnn.{LABEL}"](label_in, self._training) + return self._tf_layers[f"embed.{LABEL}"](label) + + def batch_loss( + self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] + ) -> tf.Tensor: + batch = self.batch_to_model_data_format(batch_in, self.data_signature) + + dialogue_in = batch[DIALOGUE_FEATURES][0] + label_in = batch[LABEL_FEATURES][0] + + if self.max_history_tracker_featurizer_used: + # add time dimension if max history featurizer is used + label_in = label_in[:, tf.newaxis, :] + + all_labels, all_labels_embed = self._create_all_labels_embed() + + dialogue_embed, mask = self._emebed_dialogue(dialogue_in) + label_embed = self._embed_label(label_in) + + loss, acc = self._tf_layers[f"loss.{LABEL}"]( + dialogue_embed, label_embed, label_in, all_labels_embed, all_labels, mask + ) + + self.action_loss.update_state(loss) + self.action_acc.update_state(acc) + + return loss + + def batch_predict( + self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] + ) -> Dict[Text, tf.Tensor]: + batch = self.batch_to_model_data_format(batch_in, self.predict_data_signature) + + dialogue_in = batch[DIALOGUE_FEATURES][0] + + if self.all_labels_embed is None: + _, self.all_labels_embed = self._create_all_labels_embed() + + dialogue_embed, mask = self._emebed_dialogue(dialogue_in) + + sim_all = self._tf_layers[f"loss.{LABEL}"].sim( + dialogue_embed[:, :, tf.newaxis, :], + self.all_labels_embed[tf.newaxis, tf.newaxis, :, :], + mask, + ) + + scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( + sim_all, self.config[SIMILARITY_TYPE] + ) + + return {"action_scores": scores} + + +# pytype: enable=key-error diff --git a/rasa/core/processor.py b/rasa/core/processor.py index 3e5219b7a4d9..7ea78c3638fc 100644 --- a/rasa/core/processor.py +++ b/rasa/core/processor.py @@ -149,7 +149,7 @@ async def predict_next(self, sender_id: Text) -> Optional[Dict[Text, Any]]: } async def _update_tracker_session( - self, tracker: DialogueStateTracker, output_channel: OutputChannel, + self, tracker: DialogueStateTracker, output_channel: OutputChannel ) -> None: """Check the current session in `tracker` and update it if expired. @@ -175,7 +175,7 @@ async def _update_tracker_session( ) async def get_tracker_with_session_start( - self, sender_id: Text, output_channel: Optional[OutputChannel] = None, + self, sender_id: Text, output_channel: Optional[OutputChannel] = None ) -> Optional[DialogueStateTracker]: """Get tracker for `sender_id` or create a new tracker for `sender_id`. 
diff --git a/rasa/core/training/interactive.py b/rasa/core/training/interactive.py index 267751e44205..1e84af364af8 100644 --- a/rasa/core/training/interactive.py +++ b/rasa/core/training/interactive.py @@ -1454,7 +1454,7 @@ async def record_messages( async def _get_tracker_events_to_plot( - domain: Dict[Text, Any], file_importer: TrainingDataImporter, conversation_id: Text, + domain: Dict[Text, Any], file_importer: TrainingDataImporter, conversation_id: Text ) -> List[Union[Text, List[Event]]]: training_trackers = await _get_training_trackers(file_importer, domain) number_of_trackers = len(training_trackers) diff --git a/rasa/core/training/story_conflict.py b/rasa/core/training/story_conflict.py index 2510608a68b8..d77452c4b990 100644 --- a/rasa/core/training/story_conflict.py +++ b/rasa/core/training/story_conflict.py @@ -1,12 +1,12 @@ import logging -from collections import defaultdict, namedtuple +from collections import defaultdict from typing import List, Optional, Dict, Text, Tuple, Generator, NamedTuple from rasa.core.actions.action import ACTION_LISTEN_NAME from rasa.core.domain import PREV_PREFIX, Domain from rasa.core.events import ActionExecuted, Event from rasa.core.featurizers import MaxHistoryTrackerFeaturizer -from rasa.nlu.constants import INTENT_ATTRIBUTE +from rasa.nlu.constants import INTENT from rasa.core.training.generator import TrackerWithCachedStates logger = logging.getLogger(__name__) @@ -25,7 +25,7 @@ class StoryConflict: prior events (i.e. at the beginning of a dialogue). """ - def __init__(self, sliced_states: List[Optional[Dict[Text, float]]],) -> None: + def __init__(self, sliced_states: List[Optional[Dict[Text, float]]]) -> None: """ Creates a `StoryConflict` from a given state. @@ -314,10 +314,10 @@ def _get_previous_event( ): # The `prev_...` was an action that was NOT `action_listen` return "action", turn_label.replace(PREV_PREFIX, "") - elif turn_label.startswith(INTENT_ATTRIBUTE + "_"): + elif turn_label.startswith(INTENT + "_"): # We found an intent, but it is only the previous event if # the `prev_...` was `prev_action_listen`, so we don't return. 
previous_event_type = "intent" - previous_event_name = turn_label.replace(INTENT_ATTRIBUTE + "_", "") + previous_event_name = turn_label.replace(INTENT + "_", "") return previous_event_type, previous_event_name diff --git a/rasa/nlu/classifiers/classifier.py b/rasa/nlu/classifiers/classifier.py new file mode 100644 index 000000000000..ee9d5cc73373 --- /dev/null +++ b/rasa/nlu/classifiers/classifier.py @@ -0,0 +1,5 @@ +from rasa.nlu.components import Component + + +class IntentClassifier(Component): + pass diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py new file mode 100644 index 000000000000..0039770fb8f6 --- /dev/null +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -0,0 +1,1431 @@ +import logging +from pathlib import Path + +import numpy as np +import os +import scipy.sparse +import tensorflow as tf +import tensorflow_addons as tfa + +from typing import Any, Dict, List, Optional, Text, Tuple, Union, Type + +import rasa.utils.io as io_utils +import rasa.nlu.utils.bilou_utils as bilou_utils +from rasa.nlu.featurizers.featurizer import Featurizer +from rasa.nlu.components import Component +from rasa.nlu.classifiers.classifier import IntentClassifier +from rasa.nlu.extractors.extractor import EntityExtractor +from rasa.nlu.test import determine_token_labels +from rasa.nlu.tokenizers.tokenizer import Token +from rasa.nlu.classifiers import LABEL_RANKING_LENGTH +from rasa.utils import train_utils +from rasa.utils.tensorflow import layers +from rasa.utils.tensorflow.transformer import TransformerEncoder +from rasa.utils.tensorflow.models import RasaModel +from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature +from rasa.nlu.constants import ( + INTENT, + TEXT, + ENTITIES, + NO_ENTITY_TAG, + SPARSE_FEATURE_NAMES, + DENSE_FEATURE_NAMES, + TOKENS_NAMES, +) +from rasa.nlu.config import RasaNLUModelConfig, InvalidConfigError +from rasa.nlu.training_data import TrainingData +from rasa.nlu.model import Metadata +from rasa.nlu.training_data import Message +from rasa.utils.tensorflow.constants import ( + LABEL, + HIDDEN_LAYERS_SIZES, + SHARE_HIDDEN_LAYERS, + TRANSFORMER_SIZE, + NUM_TRANSFORMER_LAYERS, + NUM_HEADS, + BATCH_SIZES, + BATCH_STRATEGY, + EPOCHS, + RANDOM_SEED, + LEARNING_RATE, + DENSE_DIMENSION, + RANKING_LENGTH, + LOSS_TYPE, + SIMILARITY_TYPE, + NUM_NEG, + SPARSE_INPUT_DROPOUT, + MASKED_LM, + ENTITY_RECOGNITION, + INTENT_CLASSIFICATION, + EVAL_NUM_EXAMPLES, + EVAL_NUM_EPOCHS, + UNIDIRECTIONAL_ENCODER, + DROP_RATE, + DROP_RATE_ATTENTION, + WEIGHT_SPARSITY, + NEGATIVE_MARGIN_SCALE, + REGULARIZATION_CONSTANT, + SCALE_LOSS, + USE_MAX_NEG_SIM, + MAX_NEG_SIM, + MAX_POS_SIM, + EMBEDDING_DIMENSION, + BILOU_FLAG, + KEY_RELATIVE_ATTENTION, + VALUE_RELATIVE_ATTENTION, + MAX_RELATIVE_POSITION, + SOFTMAX, + AUTO, + BALANCED, +) + + +logger = logging.getLogger(__name__) + +TEXT_FEATURES = f"{TEXT}_features" +LABEL_FEATURES = f"{LABEL}_features" +TEXT_MASK = f"{TEXT}_mask" +LABEL_MASK = f"{LABEL}_mask" +LABEL_IDS = f"{LABEL}_ids" +TAG_IDS = "tag_ids" + + +class DIETClassifier(IntentClassifier, EntityExtractor): + """DIET (Dual Intent and Entity Transformer) is a multi-task architecture for + intent classification and entity recognition. + + The architecture is based on a transformer which is shared for both tasks. + A sequence of entity labels is predicted through a Conditional Random Field (CRF) + tagging layer on top of the transformer output sequence corresponding to the + input sequence of tokens. 
The transformer output for the ``__CLS__`` token and + intent labels are embedded into a single semantic vector space. We use the + dot-product loss to maximize the similarity with the target label and minimize + similarities with negative samples. + """ + + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [Featurizer] + + # please make sure to update the docs when changing a default parameter + defaults = { + # ## Architecture of the used neural network + # Hidden layer sizes for layers before the embedding layers for user message + # and labels. + # The number of hidden layers is equal to the length of the corresponding + # list. + HIDDEN_LAYERS_SIZES: {TEXT: [], LABEL: []}, + # Whether to share the hidden layer weights between user message and labels. + SHARE_HIDDEN_LAYERS: False, + # Number of units in transformer + TRANSFORMER_SIZE: 256, + # Number of transformer layers + NUM_TRANSFORMER_LAYERS: 2, + # Number of attention heads in transformer + NUM_HEADS: 4, + # If 'True' use key relative embeddings in attention + KEY_RELATIVE_ATTENTION: False, + # If 'True' use key relative embeddings in attention + VALUE_RELATIVE_ATTENTION: False, + # Max position for relative embeddings + MAX_RELATIVE_POSITION: None, + # Use a unidirectional or bidirectional encoder. + UNIDIRECTIONAL_ENCODER: False, + # ## Training parameters + # Initial and final batch sizes: + # Batch size will be linearly increased for each epoch. + BATCH_SIZES: [64, 256], + # Strategy used when creating batches. + # Can be either 'sequence' or 'balanced'. + BATCH_STRATEGY: BALANCED, + # Number of epochs to train + EPOCHS: 300, + # Set random seed to any 'int' to get reproducible results + RANDOM_SEED: None, + # Initial learning rate for the optimizer + LEARNING_RATE: 0.001, + # ## Parameters for embeddings + # Dimension size of embedding vectors + EMBEDDING_DIMENSION: 20, + # Default dense dimension to use if no dense features are present. + DENSE_DIMENSION: {TEXT: 512, LABEL: 20}, + # The number of incorrect labels. The algorithm will minimize + # their similarity to the user input during training. + NUM_NEG: 20, + # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. + SIMILARITY_TYPE: AUTO, + # The type of the loss function, either 'softmax' or 'margin'. + LOSS_TYPE: SOFTMAX, + # Number of top actions to normalize scores for loss type 'softmax'. + # Set to 0 to turn off normalization. + RANKING_LENGTH: 10, + # Indicates how similar the algorithm should try to make embedding vectors + # for correct labels. + # Should be 0.0 < ... < 1.0 for 'cosine' similarity type. + MAX_POS_SIM: 0.8, + # Maximum negative similarity for incorrect labels. + # Should be -1.0 < ... < 1.0 for 'cosine' similarity type. + MAX_NEG_SIM: -0.4, + # If 'True' the algorithm only minimizes maximum similarity over + # incorrect intent labels, used only if 'loss_type' is set to 'margin'. + USE_MAX_NEG_SIM: True, + # Scale loss inverse proportionally to confidence of correct prediction + SCALE_LOSS: True, + # ## Regularization parameters + # The scale of regularization + REGULARIZATION_CONSTANT: 0.002, + # The scale of how important is to minimize the maximum similarity + # between embeddings of different labels. 
+ NEGATIVE_MARGIN_SCALE: 0.8, + # Dropout rate for encoder + DROP_RATE: 0.2, + # Dropout rate for attention + DROP_RATE_ATTENTION: 0, + # Sparsity of the weights in dense layers + WEIGHT_SPARSITY: 0.8, + # If 'True' apply dropout to sparse tensors + SPARSE_INPUT_DROPOUT: True, + # ## Evaluation parameters + # How often calculate validation accuracy. + # Small values may hurt performance, e.g. model accuracy. + EVAL_NUM_EPOCHS: 20, + # How many examples to use for hold out validation set + # Large values may hurt performance, e.g. model accuracy. + EVAL_NUM_EXAMPLES: 0, + # ## Model config + # If 'True' intent classification is trained and intent predicted. + INTENT_CLASSIFICATION: True, + # If 'True' named entity recognition is trained and entities predicted. + ENTITY_RECOGNITION: True, + # If 'True' random tokens of the input message will be masked and the model + # should predict those tokens. + MASKED_LM: False, + # 'BILOU_flag' determines whether to use BILOU tagging or not. + # If set to 'True' labelling is more rigorous, however more + # examples per entity are required. + # Rule of thumb: you should have more than 100 examples per entity. + BILOU_FLAG: True, + } + + # init helpers + def _check_masked_lm(self) -> None: + if ( + self.component_config[MASKED_LM] + and self.component_config[NUM_TRANSFORMER_LAYERS] == 0 + ): + raise ValueError( + f"If number of transformer layers is 0, " + f"'{MASKED_LM}' option should be 'False'." + ) + + def _check_share_hidden_layers_sizes(self) -> None: + if self.component_config.get(SHARE_HIDDEN_LAYERS): + first_hidden_layer_sizes = next( + iter(self.component_config[HIDDEN_LAYERS_SIZES].values()) + ) + # check that all hidden layer sizes are the same + identical_hidden_layer_sizes = all( + current_hidden_layer_sizes == first_hidden_layer_sizes + for current_hidden_layer_sizes in self.component_config[ + HIDDEN_LAYERS_SIZES + ].values() + ) + if not identical_hidden_layer_sizes: + raise ValueError( + f"If hidden layer weights are shared, " + f"{HIDDEN_LAYERS_SIZES} must coincide." + ) + + def _check_config_parameters(self) -> None: + self.component_config = train_utils.check_deprecated_options( + self.component_config + ) + + self._check_masked_lm() + self._check_share_hidden_layers_sizes() + + self.component_config = train_utils.update_similarity_type( + self.component_config + ) + self.component_config = train_utils.update_evaluation_parameters( + self.component_config + ) + + # package safety checks + @classmethod + def required_packages(cls) -> List[Text]: + return ["tensorflow"] + + def __init__( + self, + component_config: Optional[Dict[Text, Any]] = None, + index_label_id_mapping: Optional[Dict[int, Text]] = None, + index_tag_id_mapping: Optional[Dict[int, Text]] = None, + model: Optional[RasaModel] = None, + ) -> None: + """Declare instance variables with default values.""" + + if component_config is not None and EPOCHS not in component_config: + logger.warning( + f"Please configure the number of '{EPOCHS}' in your configuration file." + f" We will change the default value of '{EPOCHS}' in the future to 1. 
" + ) + + super().__init__(component_config) + + self._check_config_parameters() + + # transform numbers to labels + self.index_label_id_mapping = index_label_id_mapping + self.index_tag_id_mapping = index_tag_id_mapping + + self.model = model + + self.num_tags: Optional[int] = None # number of entity tags + self._label_data: Optional[RasaModelData] = None + self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None + + @property + def label_key(self) -> Optional[Text]: + return LABEL_IDS if self.component_config[INTENT_CLASSIFICATION] else None + + @staticmethod + def model_class() -> Type[RasaModel]: + return DIET + + # training data helpers: + @staticmethod + def _label_id_index_mapping( + training_data: TrainingData, attribute: Text + ) -> Dict[Text, int]: + """Create label_id dictionary.""" + + distinct_label_ids = { + example.get(attribute) for example in training_data.intent_examples + } - {None} + return { + label_id: idx for idx, label_id in enumerate(sorted(distinct_label_ids)) + } + + @staticmethod + def _invert_mapping(mapping: Dict) -> Dict: + return {value: key for key, value in mapping.items()} + + def _tag_id_index_mapping(self, training_data: TrainingData) -> Dict[Text, int]: + """Create tag_id dictionary""" + + if self.component_config[BILOU_FLAG]: + return bilou_utils.build_tag_id_dict(training_data) + + distinct_tag_ids = set( + e["entity"] + for example in training_data.entity_examples + for e in example.get(ENTITIES) + ) - {None} + + tag_id_dict = { + tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tag_ids), 1) + } + # NO_ENTITY_TAG corresponds to non-entity which should correspond to 0 index + # needed for correct prediction for padding + tag_id_dict[NO_ENTITY_TAG] = 0 + + return tag_id_dict + + @staticmethod + def _find_example_for_label( + label: Text, examples: List[Message], attribute: Text + ) -> Optional[Message]: + for ex in examples: + if ex.get(attribute) == label: + return ex + return None + + @staticmethod + def _check_labels_features_exist( + labels_example: List[Message], attribute: Text + ) -> bool: + """Checks if all labels have features set.""" + + return all( + label_example.get(SPARSE_FEATURE_NAMES[attribute]) is not None + or label_example.get(DENSE_FEATURE_NAMES[attribute]) is not None + for label_example in labels_example + ) + + def _extract_features( + self, message: Message, attribute: Text + ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]: + sparse_features = None + dense_features = None + + if message.get(SPARSE_FEATURE_NAMES[attribute]) is not None: + sparse_features = message.get(SPARSE_FEATURE_NAMES[attribute]) + + if message.get(DENSE_FEATURE_NAMES[attribute]) is not None: + dense_features = message.get(DENSE_FEATURE_NAMES[attribute]) + + if sparse_features is not None and dense_features is not None: + if sparse_features.shape[0] != dense_features.shape[0]: + raise ValueError( + f"Sequence dimensions for sparse and dense features " + f"don't coincide in '{message.text}' for attribute '{attribute}'." + ) + + # If we don't use the transformer and we don't want to do entity recognition, + # to speed up training take only the sentence features as feature vector. + # It corresponds to the feature vector for the last token - CLS token. + # We would not make use of the sequence anyway in this setup. Carrying over + # those features to the actual training process takes quite some time. 
+ if ( + self.component_config[NUM_TRANSFORMER_LAYERS] == 0 + and not self.component_config[ENTITY_RECOGNITION] + and attribute != INTENT + ): + sparse_features = train_utils.sequence_to_sentence_features(sparse_features) + dense_features = train_utils.sequence_to_sentence_features(dense_features) + + return sparse_features, dense_features + + def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None: + """Checks if features have same dimensionality if hidden layers are shared.""" + + if self.component_config.get(SHARE_HIDDEN_LAYERS): + num_text_features = model_data.feature_dimension(TEXT_FEATURES) + num_label_features = model_data.feature_dimension(LABEL_FEATURES) + + if num_text_features != num_label_features: + raise ValueError( + "If embeddings are shared text features and label features " + "must coincide. Check the output dimensions of previous components." + ) + + def _extract_labels_precomputed_features( + self, label_examples: List[Message], attribute: Text = INTENT + ) -> List[np.ndarray]: + """Collects precomputed encodings.""" + + sparse_features = [] + dense_features = [] + + for e in label_examples: + _sparse, _dense = self._extract_features(e, attribute) + if _sparse is not None: + sparse_features.append(_sparse) + if _dense is not None: + dense_features.append(_dense) + + sparse_features = np.array(sparse_features) + dense_features = np.array(dense_features) + + return [sparse_features, dense_features] + + @staticmethod + def _compute_default_label_features( + labels_example: List[Message], + ) -> List[np.ndarray]: + """Computes one-hot representation for the labels.""" + + eye_matrix = np.eye(len(labels_example), dtype=np.float32) + # add sequence dimension to one-hot labels + return [np.array([np.expand_dims(a, 0) for a in eye_matrix])] + + def _create_label_data( + self, + training_data: TrainingData, + label_id_dict: Dict[Text, int], + attribute: Text, + ) -> RasaModelData: + """Create matrix with label_ids encoded in rows as bag of words. + + Find a training example for each label and get the encoded features + from the corresponding Message object. + If the features are already computed, fetch them from the message object + else compute a one hot encoding for the label as the feature vector. 
+ """ + + # Collect one example for each label + labels_idx_examples = [] + for label_name, idx in label_id_dict.items(): + label_example = self._find_example_for_label( + label_name, training_data.intent_examples, attribute + ) + labels_idx_examples.append((idx, label_example)) + + # Sort the list of tuples based on label_idx + labels_idx_examples = sorted(labels_idx_examples, key=lambda x: x[0]) + labels_example = [example for (_, example) in labels_idx_examples] + + # Collect features, precomputed if they exist, else compute on the fly + if self._check_labels_features_exist(labels_example, attribute): + features = self._extract_labels_precomputed_features( + labels_example, attribute + ) + else: + features = self._compute_default_label_features(labels_example) + + label_data = RasaModelData() + label_data.add_features(LABEL_FEATURES, features) + + label_ids = np.array([idx for (idx, _) in labels_idx_examples]) + # explicitly add last dimension to label_ids + # to track correctly dynamic sequences + label_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)]) + + label_data.add_mask(LABEL_MASK, LABEL_FEATURES) + + return label_data + + def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]: + all_label_features = self._label_data.get(LABEL_FEATURES)[0] + return [np.array([all_label_features[label_id] for label_id in label_ids])] + + def _create_model_data( + self, + training_data: List[Message], + label_id_dict: Optional[Dict[Text, int]] = None, + tag_id_dict: Optional[Dict[Text, int]] = None, + label_attribute: Optional[Text] = None, + ) -> RasaModelData: + """Prepare data for training and create a RasaModelData object""" + + X_sparse = [] + X_dense = [] + Y_sparse = [] + Y_dense = [] + label_ids = [] + tag_ids = [] + + for e in training_data: + if label_attribute is None or e.get(label_attribute): + _sparse, _dense = self._extract_features(e, TEXT) + if _sparse is not None: + X_sparse.append(_sparse) + if _dense is not None: + X_dense.append(_dense) + + if e.get(label_attribute): + _sparse, _dense = self._extract_features(e, label_attribute) + if _sparse is not None: + Y_sparse.append(_sparse) + if _dense is not None: + Y_dense.append(_dense) + + if label_id_dict: + label_ids.append(label_id_dict[e.get(label_attribute)]) + + if self.component_config.get(ENTITY_RECOGNITION) and tag_id_dict: + if self.component_config[BILOU_FLAG]: + _tags = bilou_utils.tags_to_ids(e, tag_id_dict) + else: + _tags = [] + for t in e.get(TOKENS_NAMES[TEXT]): + _tag = determine_token_labels(t, e.get(ENTITIES), None) + _tags.append(tag_id_dict[_tag]) + # transpose to have seq_len x 1 + tag_ids.append(np.array([_tags]).T) + + X_sparse = np.array(X_sparse) + X_dense = np.array(X_dense) + Y_sparse = np.array(Y_sparse) + Y_dense = np.array(Y_dense) + label_ids = np.array(label_ids) + tag_ids = np.array(tag_ids) + + model_data = RasaModelData(label_key=self.label_key) + model_data.add_features(TEXT_FEATURES, [X_sparse, X_dense]) + model_data.add_features(LABEL_FEATURES, [Y_sparse, Y_dense]) + if label_attribute and model_data.feature_not_exist(LABEL_FEATURES): + # no label features are present, get default features from _label_data + model_data.add_features( + LABEL_FEATURES, self._use_default_label_features(label_ids) + ) + + # explicitly add last dimension to label_ids + # to track correctly dynamic sequences + model_data.add_features(LABEL_IDS, [np.expand_dims(label_ids, -1)]) + model_data.add_features(TAG_IDS, [tag_ids]) + + model_data.add_mask(TEXT_MASK, TEXT_FEATURES) + 
model_data.add_mask(LABEL_MASK, LABEL_FEATURES) + + return model_data + + # train helpers + def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData: + """Prepares data for training. + + Performs sanity checks on training data, extracts encodings for labels. + """ + + if self.component_config[BILOU_FLAG]: + bilou_utils.apply_bilou_schema(training_data) + + label_id_index_mapping = self._label_id_index_mapping( + training_data, attribute=INTENT + ) + + if not label_id_index_mapping: + # no labels are present to train + return RasaModelData() + + self.index_label_id_mapping = self._invert_mapping(label_id_index_mapping) + + self._label_data = self._create_label_data( + training_data, label_id_index_mapping, attribute=INTENT + ) + + tag_id_index_mapping = self._tag_id_index_mapping(training_data) + self.index_tag_id_mapping = self._invert_mapping(tag_id_index_mapping) + + label_attribute = ( + INTENT if self.component_config[INTENT_CLASSIFICATION] else None + ) + + model_data = self._create_model_data( + training_data.training_examples, + label_id_index_mapping, + tag_id_index_mapping, + label_attribute=label_attribute, + ) + + self.num_tags = len(self.index_tag_id_mapping) + + self._check_input_dimension_consistency(model_data) + + return model_data + + @staticmethod + def _check_enough_labels(model_data: RasaModelData) -> bool: + return len(np.unique(model_data.get(LABEL_IDS))) >= 2 + + def train( + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, + ) -> None: + """Train the embedding intent classifier on a data set.""" + + model_data = self.preprocess_train_data(training_data) + if model_data.is_empty(): + logger.debug( + f"Cannot train '{self.__class__.__name__}'. No data was provided. " + f"Skipping training of the classifier." + ) + return + + if self.component_config.get(INTENT_CLASSIFICATION): + if not self._check_enough_labels(model_data): + logger.error( + f"Cannot train '{self.__class__.__name__}'. " + f"Need at least 2 different intent classes. " + f"Skipping training of classifier." + ) + return + + # keep one example for persisting and loading + self.data_example = self.data_example = model_data.first_data_example() + + self.model = self.model_class()( + data_signature=model_data.get_signature(), + label_data=self._label_data, + index_tag_id_mapping=self.index_tag_id_mapping, + config=self.component_config, + ) + + self.model.fit( + model_data, + self.component_config[EPOCHS], + self.component_config[BATCH_SIZES], + self.component_config[EVAL_NUM_EXAMPLES], + self.component_config[EVAL_NUM_EPOCHS], + self.component_config[BATCH_STRATEGY], + ) + + # process helpers + def _predict(self, message: Message) -> Optional[Dict[Text, tf.Tensor]]: + if self.model is None: + logger.debug( + "There is no trained model: component is either not trained or " + "didn't receive enough training data." 
+ ) + return + + # create session data from message and convert it into a batch of 1 + model_data = self._create_model_data([message]) + + return self.model.predict(model_data) + + def _predict_label( + self, predict_out: Optional[Dict[Text, tf.Tensor]] + ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]: + """Predicts the intent of the provided message.""" + + label = {"name": None, "confidence": 0.0} + label_ranking = [] + + if predict_out is None: + return label, label_ranking + + message_sim = predict_out["i_scores"].numpy() + + message_sim = message_sim.flatten() # sim is a matrix + + label_ids = message_sim.argsort()[::-1] + + if ( + self.component_config[LOSS_TYPE] == SOFTMAX + and self.component_config[RANKING_LENGTH] > 0 + ): + message_sim = train_utils.normalize( + message_sim, self.component_config[RANKING_LENGTH] + ) + + message_sim[::-1].sort() + message_sim = message_sim.tolist() + + # if X contains all zeros do not predict some label + if label_ids.size > 0: + label = { + "name": self.index_label_id_mapping[label_ids[0]], + "confidence": message_sim[0], + } + + if ( + self.component_config[RANKING_LENGTH] + and 0 < self.component_config[RANKING_LENGTH] < LABEL_RANKING_LENGTH + ): + output_length = self.component_config[RANKING_LENGTH] + else: + output_length = LABEL_RANKING_LENGTH + + ranking = list(zip(list(label_ids), message_sim)) + ranking = ranking[:output_length] + label_ranking = [ + {"name": self.index_label_id_mapping[label_idx], "confidence": score} + for label_idx, score in ranking + ] + + return label, label_ranking + + def _predict_entities( + self, predict_out: Optional[Dict[Text, tf.Tensor]], message: Message + ) -> List[Dict]: + if predict_out is None: + return [] + + # load tf graph and session + predictions = predict_out["e_ids"].numpy() + + tags = [self.index_tag_id_mapping[p] for p in predictions[0]] + + if self.component_config[BILOU_FLAG]: + tags = bilou_utils.remove_bilou_prefixes(tags) + + entities = self._convert_tags_to_entities( + message.text, message.get(TOKENS_NAMES[TEXT], []), tags + ) + + extracted = self.add_extractor_name(entities) + entities = message.get(ENTITIES, []) + extracted + + return entities + + @staticmethod + def _convert_tags_to_entities( + text: Text, tokens: List[Token], tags: List[Text] + ) -> List[Dict[Text, Any]]: + entities = [] + last_tag = NO_ENTITY_TAG + for token, tag in zip(tokens, tags): + if tag == NO_ENTITY_TAG: + last_tag = tag + continue + + # new tag found + if last_tag != tag: + entity = { + "entity": tag, + "start": token.start, + "end": token.end, + "extractor": "DIET", + } + entities.append(entity) + + # belongs to last entity + elif last_tag == tag: + entities[-1]["end"] = token.end + + last_tag = tag + + for entity in entities: + entity["value"] = text[entity["start"] : entity["end"]] + + return entities + + def process(self, message: Message, **kwargs: Any) -> None: + """Return the most likely label and its similarity to the input.""" + + out = self._predict(message) + + if self.component_config[INTENT_CLASSIFICATION]: + label, label_ranking = self._predict_label(out) + + message.set(INTENT, label, add_to_output=True) + message.set("intent_ranking", label_ranking, add_to_output=True) + + if self.component_config[ENTITY_RECOGNITION]: + entities = self._predict_entities(out, message) + + message.set(ENTITIES, entities, add_to_output=True) + + def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]: + """Persist this model into the passed directory. 
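+
+        Alongside the TensorFlow model file, the data example, the label data and
+        the label and tag index mappings are pickled as well, so that ``load`` can
+        rebuild the model and its prediction graph from the same directory.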
+ + Return the metadata necessary to load the model again. + """ + + if self.model is None: + return {"file": None} + + model_dir = Path(model_dir) + tf_model_file = model_dir / f"{file_name}.tf_model" + + io_utils.create_directory_for_file(tf_model_file) + + self.model.save(str(tf_model_file)) + + io_utils.pickle_dump( + model_dir / f"{file_name}.data_example.pkl", self.data_example + ) + io_utils.pickle_dump( + model_dir / f"{file_name}.label_data.pkl", self._label_data + ) + io_utils.json_pickle( + model_dir / f"{file_name}.index_label_id_mapping.pkl", + self.index_label_id_mapping, + ) + io_utils.json_pickle( + model_dir / f"{file_name}.index_tag_id_mapping.pkl", + self.index_tag_id_mapping, + ) + + return {"file": file_name} + + @classmethod + def load( + cls, + meta: Dict[Text, Any], + model_dir: Text = None, + model_metadata: Metadata = None, + cached_component: Optional["DIETClassifier"] = None, + **kwargs: Any, + ) -> "DIETClassifier": + """Loads the trained model from the provided directory.""" + + if not model_dir or not meta.get("file"): + logger.debug( + f"Failed to load model. " + f"Maybe the path '{os.path.abspath(model_dir)}' doesn't exist?" + ) + return cls(component_config=meta) + + ( + index_label_id_mapping, + index_tag_id_mapping, + label_data, + meta, + data_example, + ) = cls._load_from_files(meta, model_dir) + + meta = train_utils.update_similarity_type(meta) + + model = cls._load_model( + index_tag_id_mapping, label_data, meta, data_example, model_dir + ) + + return cls( + component_config=meta, + index_label_id_mapping=index_label_id_mapping, + index_tag_id_mapping=index_tag_id_mapping, + model=model, + ) + + @classmethod + def _load_from_files(cls, meta: Dict[Text, Any], model_dir: Text): + file_name = meta.get("file") + + model_dir = Path(model_dir) + + data_example = io_utils.pickle_load(model_dir / f"{file_name}.data_example.pkl") + label_data = io_utils.pickle_load(model_dir / f"{file_name}.label_data.pkl") + index_label_id_mapping = io_utils.json_unpickle( + model_dir / f"{file_name}.index_label_id_mapping.pkl" + ) + index_tag_id_mapping = io_utils.json_unpickle( + model_dir / f"{file_name}.index_tag_id_mapping.pkl" + ) + + # jsonpickle converts dictionary keys to strings + index_label_id_mapping = { + int(key): value for key, value in index_label_id_mapping.items() + } + if index_tag_id_mapping is not None: + index_tag_id_mapping = { + int(key): value for key, value in index_tag_id_mapping.items() + } + + return ( + index_label_id_mapping, + index_tag_id_mapping, + label_data, + meta, + data_example, + ) + + @classmethod + def _load_model( + cls, + index_tag_id_mapping: Dict[int, Text], + label_data: RasaModelData, + meta: Dict[Text, Any], + data_example: Dict[Text, List[np.ndarray]], + model_dir: Text, + ): + file_name = meta.get("file") + tf_model_file = os.path.join(model_dir, file_name + ".tf_model") + + label_key = LABEL_IDS if meta[INTENT_CLASSIFICATION] else None + model_data_example = RasaModelData(label_key=label_key, data=data_example) + + model = cls.model_class().load( + tf_model_file, + model_data_example, + data_signature=model_data_example.get_signature(), + label_data=label_data, + index_tag_id_mapping=index_tag_id_mapping, + config=meta, + ) + + # build the graph for prediction + predict_data_example = RasaModelData( + label_key=label_key, + data={ + feature_name: features + for feature_name, features in model_data_example.items() + if TEXT in feature_name + }, + ) + + model.build_for_predict(predict_data_example) + + return model + + +# 
accessing _tf_layers with any key results in key-error, disable it
+# pytype: disable=key-error
+
+
+class DIET(RasaModel):
+    def __init__(
+        self,
+        data_signature: Dict[Text, List[FeatureSignature]],
+        label_data: RasaModelData,
+        index_tag_id_mapping: Optional[Dict[int, Text]],
+        config: Dict[Text, Any],
+    ) -> None:
+        super().__init__(name="DIET", random_seed=config[RANDOM_SEED])
+
+        self.config = config
+
+        self.data_signature = data_signature
+        self._check_data()
+
+        self.predict_data_signature = {
+            feature_name: features
+            for feature_name, features in data_signature.items()
+            if TEXT in feature_name
+        }
+
+        label_batch = label_data.prepare_batch()
+        self.tf_label_data = self.batch_to_model_data_format(
+            label_batch, label_data.get_signature()
+        )
+        self._num_tags = (
+            len(index_tag_id_mapping) if index_tag_id_mapping is not None else 0
+        )
+
+        # tf objects
+        self._tf_layers: Dict[Text, tf.keras.layers.Layer] = {}
+        self._prepare_layers()
+
+        # tf training
+        self._set_optimizer(tf.keras.optimizers.Adam(config[LEARNING_RATE]))
+        self._create_metrics()
+        self._update_metrics_to_log()
+
+        self.all_labels_embed = None  # needed for efficient prediction
+
+    def _check_data(self) -> None:
+        if TEXT_FEATURES not in self.data_signature:
+            raise InvalidConfigError(
+                f"No text features specified. "
+                f"Cannot train '{self.__class__.__name__}' model."
+            )
+        if self.config[INTENT_CLASSIFICATION]:
+            if LABEL_FEATURES not in self.data_signature:
+                raise InvalidConfigError(
+                    f"No label features specified. "
+                    f"Cannot train '{self.__class__.__name__}' model."
+                )
+            if (
+                self.config[SHARE_HIDDEN_LAYERS]
+                and self.data_signature[TEXT_FEATURES]
+                != self.data_signature[LABEL_FEATURES]
+            ):
+                raise ValueError(
+                    "If hidden layer weights are shared, data signatures "
+                    "for text_features and label_features must coincide."
+                )
+
+        if self.config[ENTITY_RECOGNITION] and TAG_IDS not in self.data_signature:
+            raise ValueError(
+                f"No tag ids present. "
+                f"Cannot train '{self.__class__.__name__}' model."
+ ) + + def _create_metrics(self) -> None: + # self.metrics will have the same order as they are created + # so create loss metrics first to output losses first + self.mask_loss = tf.keras.metrics.Mean(name="m_loss") + self.intent_loss = tf.keras.metrics.Mean(name="i_loss") + self.entity_loss = tf.keras.metrics.Mean(name="e_loss") + # create accuracy metrics second to output accuracies second + self.mask_acc = tf.keras.metrics.Mean(name="m_acc") + self.response_acc = tf.keras.metrics.Mean(name="i_acc") + self.entity_f1 = tf.keras.metrics.Mean(name="e_f1") + + def _update_metrics_to_log(self) -> None: + if self.config[MASKED_LM]: + self.metrics_to_log += ["m_loss", "m_acc"] + if self.config[INTENT_CLASSIFICATION]: + self.metrics_to_log += ["i_loss", "i_acc"] + if self.config[ENTITY_RECOGNITION]: + self.metrics_to_log += ["e_loss", "e_f1"] + + def _prepare_layers(self) -> None: + self.text_name = TEXT + self._prepare_sequence_layers(self.text_name) + if self.config[MASKED_LM]: + self._prepare_mask_lm_layers(self.text_name) + if self.config[INTENT_CLASSIFICATION]: + self.label_name = TEXT if self.config[SHARE_HIDDEN_LAYERS] else LABEL + self._prepare_input_layers(self.label_name) + self._prepare_label_classification_layers() + if self.config[ENTITY_RECOGNITION]: + self._prepare_entity_recognition_layers() + + def _prepare_sparse_dense_layers( + self, + feature_signatures: List[FeatureSignature], + name: Text, + reg_lambda: float, + dense_dim: int, + ) -> None: + sparse = False + dense = False + for is_sparse, shape in feature_signatures: + if is_sparse: + sparse = True + else: + dense = True + # if dense features are present + # use the feature dimension of the dense features + dense_dim = shape[-1] + + if sparse: + self._tf_layers[f"sparse_to_dense.{name}"] = layers.DenseForSparse( + units=dense_dim, reg_lambda=reg_lambda, name=name + ) + if not dense: + # create dense labels for the input to use in negative sampling + self._tf_layers[f"sparse_to_dense_ids.{name}"] = layers.DenseForSparse( + units=2, trainable=False, name=f"sparse_to_dense_ids.{name}" + ) + + def _prepare_input_layers(self, name: Text) -> None: + self._tf_layers[f"sparse_dropout.{name}"] = layers.SparseDropout( + rate=self.config[DROP_RATE] + ) + self._prepare_sparse_dense_layers( + self.data_signature[f"{name}_features"], + name, + self.config[REGULARIZATION_CONSTANT], + self.config[DENSE_DIMENSION][name], + ) + self._tf_layers[f"ffnn.{name}"] = layers.Ffnn( + self.config[HIDDEN_LAYERS_SIZES][name], + self.config[DROP_RATE], + self.config[REGULARIZATION_CONSTANT], + self.config[WEIGHT_SPARSITY], + name, + ) + + def _prepare_embed_layers(self, name: Text) -> None: + self._tf_layers[f"embed.{name}"] = layers.Embed( + self.config[EMBEDDING_DIMENSION], + self.config[REGULARIZATION_CONSTANT], + name, + self.config[SIMILARITY_TYPE], + ) + + def _prepare_dot_product_loss(self, name: Text) -> None: + self._tf_layers[f"loss.{name}"] = layers.DotProductLoss( + self.config[NUM_NEG], + self.config[LOSS_TYPE], + self.config[MAX_POS_SIM], + self.config[MAX_NEG_SIM], + self.config[USE_MAX_NEG_SIM], + self.config[NEGATIVE_MARGIN_SCALE], + self.config[SCALE_LOSS], + # set to 1 to get deterministic behaviour + parallel_iterations=1 if self.random_seed is not None else 1000, + ) + + def _prepare_sequence_layers(self, name: Text) -> None: + self._prepare_input_layers(name) + + if self.config[NUM_TRANSFORMER_LAYERS] > 0: + self._tf_layers[f"{name}_transformer"] = TransformerEncoder( + self.config[NUM_TRANSFORMER_LAYERS], + 
self.config[TRANSFORMER_SIZE], + self.config[NUM_HEADS], + self.config[TRANSFORMER_SIZE] * 4, + self.config[REGULARIZATION_CONSTANT], + dropout_rate=self.config[DROP_RATE], + attention_dropout_rate=self.config[DROP_RATE_ATTENTION], + sparsity=self.config[WEIGHT_SPARSITY], + unidirectional=self.config[UNIDIRECTIONAL_ENCODER], + use_key_relative_position=self.config[KEY_RELATIVE_ATTENTION], + use_value_relative_position=self.config[VALUE_RELATIVE_ATTENTION], + max_relative_position=self.config[MAX_RELATIVE_POSITION], + name=f"{name}_encoder", + ) + else: + # create lambda so that it can be used later without the check + self._tf_layers[f"{name}_transformer"] = lambda x, mask, training: x + + def _prepare_mask_lm_layers(self, name: Text) -> None: + self._tf_layers[f"{name}_input_mask"] = layers.InputMask() + + self._prepare_embed_layers(f"{name}_lm_mask") + self._prepare_embed_layers(f"{name}_golden_token") + + self._prepare_dot_product_loss(f"{name}_mask") + + def _prepare_label_classification_layers(self) -> None: + self._prepare_embed_layers(TEXT) + self._prepare_embed_layers(LABEL) + + self._prepare_dot_product_loss(LABEL) + + def _prepare_entity_recognition_layers(self) -> None: + self._tf_layers["embed.logits"] = layers.Embed( + self._num_tags, self.config[REGULARIZATION_CONSTANT], "logits" + ) + self._tf_layers["crf"] = layers.CRF( + self._num_tags, self.config[REGULARIZATION_CONSTANT] + ) + self._tf_layers["crf_f1_score"] = tfa.metrics.F1Score( + num_classes=self._num_tags - 1, # `0` prediction is not a prediction + average="micro", + ) + + @staticmethod + def _get_sequence_lengths(mask: tf.Tensor) -> tf.Tensor: + return tf.cast(tf.reduce_sum(mask[:, :, 0], axis=1), tf.int32) + + def _combine_sparse_dense_features( + self, + features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], + mask: tf.Tensor, + name: Text, + sparse_dropout: bool = False, + ) -> tf.Tensor: + + dense_features = [] + + for f in features: + if isinstance(f, tf.SparseTensor): + if sparse_dropout: + _f = self._tf_layers[f"sparse_dropout.{name}"](f, self._training) + else: + _f = f + dense_features.append(self._tf_layers[f"sparse_to_dense.{name}"](_f)) + else: + dense_features.append(f) + + return tf.concat(dense_features, axis=-1) * mask + + def _features_as_seq_ids( + self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text + ) -> tf.Tensor: + """Creates dense labels for negative sampling.""" + + # if there are dense features - we can use them + for f in features: + if not isinstance(f, tf.SparseTensor): + return tf.stop_gradient(f) + + # use additional sparse to dense layer + for f in features: + if isinstance(f, tf.SparseTensor): + return tf.stop_gradient( + self._tf_layers[f"sparse_to_dense_ids.{name}"](f) + ) + + def _create_bow( + self, + features: List[Union[tf.Tensor, tf.SparseTensor]], + mask: tf.Tensor, + name: Text, + sparse_dropout: bool = False, + ) -> tf.Tensor: + + x = self._combine_sparse_dense_features(features, mask, name, sparse_dropout) + x = tf.reduce_sum(x, axis=1) # convert to bag-of-words + return self._tf_layers[f"ffnn.{name}"](x, self._training) + + def _create_sequence( + self, + features: List[Union[tf.Tensor, tf.SparseTensor]], + mask: tf.Tensor, + name: Text, + masked_lm_loss: bool = False, + sequence_ids: bool = False, + ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]: + if sequence_ids: + seq_ids = self._features_as_seq_ids(features, name) + else: + seq_ids = None + + inputs = self._combine_sparse_dense_features( + features, mask, 
name, sparse_dropout=self.config[SPARSE_INPUT_DROPOUT] + ) + + x = self._tf_layers[f"ffnn.{name}"](inputs, self._training) + + if masked_lm_loss: + x, lm_mask_bool = self._tf_layers[f"{name}_input_mask"]( + x, mask, self._training + ) + else: + lm_mask_bool = None + + outputs = self._tf_layers[f"{name}_transformer"](x, 1 - mask, self._training) + outputs = tfa.activations.gelu(outputs) + + return outputs, inputs, seq_ids, lm_mask_bool + + def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]: + all_label_ids = self.tf_label_data[LABEL_IDS][0] + x = self._create_bow( + self.tf_label_data[LABEL_FEATURES], + self.tf_label_data[LABEL_MASK][0], + self.label_name, + ) + all_labels_embed = self._tf_layers[f"embed.{LABEL}"](x) + + return all_label_ids, all_labels_embed + + @staticmethod + def _last_token(x: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor: + last_sequence_index = tf.maximum(0, sequence_lengths - 1) + batch_index = tf.range(tf.shape(last_sequence_index)[0]) + + indices = tf.stack([batch_index, last_sequence_index], axis=1) + return tf.gather_nd(x, indices) + + def _f1_score_from_ids( + self, tag_ids: tf.Tensor, pred_ids: tf.Tensor, mask: tf.Tensor + ) -> tf.Tensor: + """Calculates f1 score for train predictions""" + + mask_bool = tf.cast(mask[:, :, 0], tf.bool) + # pick only non padding values and flatten sequences + tag_ids_flat = tf.boolean_mask(tag_ids, mask_bool) + pred_ids_flat = tf.boolean_mask(pred_ids, mask_bool) + # set `0` prediction to not a prediction + tag_ids_flat_one_hot = tf.one_hot(tag_ids_flat - 1, self._num_tags - 1) + pred_ids_flat_one_hot = tf.one_hot(pred_ids_flat - 1, self._num_tags - 1) + + return self._tf_layers["crf_f1_score"]( + tag_ids_flat_one_hot, pred_ids_flat_one_hot + ) + + def _mask_loss( + self, + outputs: tf.Tensor, + inputs: tf.Tensor, + seq_ids: tf.Tensor, + lm_mask_bool: tf.Tensor, + name: Text, + ) -> tf.Tensor: + # make sure there is at least one element in the mask + lm_mask_bool = tf.cond( + tf.reduce_any(lm_mask_bool), + lambda: lm_mask_bool, + lambda: tf.scatter_nd([[0, 0, 0]], [True], tf.shape(lm_mask_bool)), + ) + + lm_mask_bool = tf.squeeze(lm_mask_bool, -1) + # pick elements that were masked + outputs = tf.boolean_mask(outputs, lm_mask_bool) + inputs = tf.boolean_mask(inputs, lm_mask_bool) + ids = tf.boolean_mask(seq_ids, lm_mask_bool) + + outputs_embed = self._tf_layers[f"embed.{name}_lm_mask"](outputs) + inputs_embed = self._tf_layers[f"embed.{name}_golden_token"](inputs) + + return self._tf_layers[f"loss.{name}_mask"]( + outputs_embed, inputs_embed, ids, inputs_embed, ids + ) + + def _calculate_label_loss( + self, a: tf.Tensor, b: tf.Tensor, label_ids: tf.Tensor + ) -> tf.Tensor: + all_label_ids, all_labels_embed = self._create_all_labels() + + a_embed = self._tf_layers[f"embed.{TEXT}"](a) + b_embed = self._tf_layers[f"embed.{LABEL}"](b) + + return self._tf_layers[f"loss.{LABEL}"]( + a_embed, b_embed, label_ids, all_labels_embed, all_label_ids + ) + + def _calculate_entity_loss( + self, + outputs: tf.Tensor, + tag_ids: tf.Tensor, + mask: tf.Tensor, + sequence_lengths: tf.Tensor, + ) -> Tuple[tf.Tensor, tf.Tensor]: + + sequence_lengths = sequence_lengths - 1 # remove cls token + tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32) + logits = self._tf_layers["embed.logits"](outputs) + + # should call first to build weights + pred_ids = self._tf_layers["crf"](logits, sequence_lengths) + # pytype cannot infer that 'self._tf_layers["crf"]' has the method '.loss' + # pytype: disable=attribute-error + loss = 
self._tf_layers["crf"].loss(logits, tag_ids, sequence_lengths) + # pytype: enable=attribute-error + + f1 = self._f1_score_from_ids(tag_ids, pred_ids, mask) + + return loss, f1 + + def batch_loss( + self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] + ) -> tf.Tensor: + tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) + + mask_text = tf_batch_data[TEXT_MASK][0] + sequence_lengths = self._get_sequence_lengths(mask_text) + + ( + text_transformed, + text_in, + text_seq_ids, + lm_mask_bool_text, + ) = self._create_sequence( + tf_batch_data[TEXT_FEATURES], + mask_text, + self.text_name, + self.config[MASKED_LM], + sequence_ids=True, + ) + + losses = [] + + if self.config[MASKED_LM]: + loss, acc = self._mask_loss( + text_transformed, text_in, text_seq_ids, lm_mask_bool_text, TEXT + ) + self.mask_loss.update_state(loss) + self.mask_acc.update_state(acc) + losses.append(loss) + + if self.config[INTENT_CLASSIFICATION]: + # get _cls_ vector for intent classification + cls = self._last_token(text_transformed, sequence_lengths) + + label_ids = tf_batch_data[LABEL_IDS][0] + label = self._create_bow( + tf_batch_data[LABEL_FEATURES], + tf_batch_data[LABEL_MASK][0], + self.label_name, + ) + loss, acc = self._calculate_label_loss(cls, label, label_ids) + self.intent_loss.update_state(loss) + self.response_acc.update_state(acc) + losses.append(loss) + + if self.config[ENTITY_RECOGNITION]: + tag_ids = tf_batch_data[TAG_IDS][0] + + loss, f1 = self._calculate_entity_loss( + text_transformed, tag_ids, mask_text, sequence_lengths + ) + self.entity_loss.update_state(loss) + self.entity_f1.update_state(f1) + losses.append(loss) + + return tf.math.add_n(losses) + + def batch_predict( + self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] + ) -> Dict[Text, tf.Tensor]: + tf_batch_data = self.batch_to_model_data_format( + batch_in, self.predict_data_signature + ) + + mask_text = tf_batch_data[TEXT_MASK][0] + sequence_lengths = self._get_sequence_lengths(mask_text) + + text_transformed, _, _, _ = self._create_sequence( + tf_batch_data[TEXT_FEATURES], mask_text, self.text_name + ) + + out = {} + if self.config[INTENT_CLASSIFICATION]: + if self.all_labels_embed is None: + _, self.all_labels_embed = self._create_all_labels() + + # get _cls_ vector for intent classification + cls = self._last_token(text_transformed, sequence_lengths) + cls_embed = self._tf_layers[f"embed.{TEXT}"](cls) + + # pytype cannot infer that 'self._tf_layers[f"loss.{LABEL}"]' has methods + # like '.sim' or '.confidence_from_sim' + # pytype: disable=attribute-error + sim_all = self._tf_layers[f"loss.{LABEL}"].sim( + cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :] + ) + scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( + sim_all, self.config[SIMILARITY_TYPE] + ) + # pytype: enable=attribute-error + out["i_scores"] = scores + + if self.config[ENTITY_RECOGNITION]: + logits = self._tf_layers["embed.logits"](text_transformed) + pred_ids = self._tf_layers["crf"](logits, sequence_lengths - 1) + out["e_ids"] = pred_ids + + return out + + +# pytype: enable=key-error diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 36e89478cb22..43b485df5e3c 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -1,1006 +1,163 @@ import logging - -import numpy as np -import os -import pickle -import scipy.sparse -import typing -from typing import Any, Dict, 
List, Optional, Text, Tuple, Union - -from rasa.nlu.featurizers.featurizer import sequence_to_sentence_features -from rasa.nlu.classifiers import LABEL_RANKING_LENGTH -from rasa.nlu.components import Component, any_of -from rasa.utils import train_utils -from rasa.utils.train_utils import SessionDataType -from rasa.nlu.constants import ( - INTENT_ATTRIBUTE, - TEXT_ATTRIBUTE, - SPARSE_FEATURE_NAMES, - DENSE_FEATURE_NAMES, +from typing import Any, Dict, Optional, Text, List, Type + +from rasa.constants import DOCS_URL_MIGRATION_GUIDE +from rasa.nlu.featurizers.featurizer import Featurizer +from rasa.nlu.components import Component +from rasa.nlu.classifiers.diet_classifier import DIETClassifier +from rasa.nlu.constants import TEXT +from rasa.utils.tensorflow.constants import ( + LABEL, + HIDDEN_LAYERS_SIZES, + SHARE_HIDDEN_LAYERS, + NUM_TRANSFORMER_LAYERS, + BATCH_SIZES, + BATCH_STRATEGY, + EPOCHS, + RANDOM_SEED, + LEARNING_RATE, + DENSE_DIMENSION, + RANKING_LENGTH, + LOSS_TYPE, + SIMILARITY_TYPE, + NUM_NEG, + SPARSE_INPUT_DROPOUT, + MASKED_LM, + ENTITY_RECOGNITION, + INTENT_CLASSIFICATION, + EVAL_NUM_EXAMPLES, + EVAL_NUM_EPOCHS, + DROP_RATE, + WEIGHT_SPARSITY, + NEGATIVE_MARGIN_SCALE, + REGULARIZATION_CONSTANT, + SCALE_LOSS, + USE_MAX_NEG_SIM, + MAX_NEG_SIM, + MAX_POS_SIM, + EMBEDDING_DIMENSION, + BILOU_FLAG, + SOFTMAX, + AUTO, + BALANCED, ) - -import tensorflow as tf - -# avoid warning println on contrib import - remove for tf 2 -from rasa.utils.common import raise_warning - -tf.contrib._warning = None +import rasa.utils.common as common_utils +from rasa.utils.tensorflow.models import RasaModel logger = logging.getLogger(__name__) -if typing.TYPE_CHECKING: - from rasa.nlu.config import RasaNLUModelConfig - from rasa.nlu.training_data import TrainingData - from rasa.nlu.model import Metadata - from rasa.nlu.training_data import Message - -class EmbeddingIntentClassifier(Component): - """Intent classifier using supervised embeddings. +class EmbeddingIntentClassifier(DIETClassifier): + """Dual Intent Entity Transformer used for intent classification. - The embedding intent classifier embeds user inputs - and intent labels into the same space. + The ``EmbeddingIntentClassifier`` embeds user inputs and intent labels into the + same space. Supervised embeddings are trained by maximizing similarity between them. - It also provides rankings of the labels that did not "win". - - The embedding intent classifier needs to be preceded by - a featurizer in the pipeline. - This featurizer creates the features used for the embeddings. - It is recommended to use ``CountVectorsFeaturizer`` that - can be optionally preceded by ``SpacyNLP`` and ``SpacyTokenizer``. - - Based on the starspace idea from: https://arxiv.org/abs/1709.03856. - However, in this implementation the `mu` parameter is treated differently - and additional hidden layers are added together with dropout. + This algorithm is based on `StarSpace `_. + However, in this implementation the loss function is slightly different and + additional hidden layers are added together with dropout. + This algorithm also provides similarity rankings of the labels that did not "win". 
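+
+    A minimal construction sketch, with illustrative override values only (inside a
+    pipeline the component is created, trained and persisted by Rasa itself); the
+    ``EPOCHS`` and ``RANKING_LENGTH`` config keys are imported from
+    ``rasa.utils.tensorflow.constants``::
+
+        classifier = EmbeddingIntentClassifier(
+            component_config={EPOCHS: 100, RANKING_LENGTH: 5}
+        )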
""" - provides = ["intent", "intent_ranking"] - - requires = [ - any_of( - DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE] - ) - ] + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [Featurizer] - # default properties (DOC MARKER - don't remove) + # please make sure to update the docs when changing a default parameter defaults = { - # nn architecture - # sizes of hidden layers before the embedding layer for input words - # the number of hidden layers is thus equal to the length of this list - "hidden_layers_sizes_a": [256, 128], - # sizes of hidden layers before the embedding layer for intent labels - # the number of hidden layers is thus equal to the length of this list - "hidden_layers_sizes_b": [], - # Whether to share the hidden layer weights between input words and labels - "share_hidden_layers": False, - # training parameters - # initial and final batch sizes - batch size will be - # linearly increased for each epoch - "batch_size": [64, 256], - # how to create batches - "batch_strategy": "balanced", # string 'sequence' or 'balanced' - # number of epochs - "epochs": 300, - # set random seed to any int to get reproducible results - "random_seed": None, - # embedding parameters - # default dense dimension used if no dense features are present - "dense_dim": {"text": 512, "label": 20}, - # dimension size of embedding vectors - "embed_dim": 20, - # the type of the similarity - "num_neg": 20, - # flag if minimize only maximum similarity over incorrect actions - "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' - # the type of the loss function - "loss_type": "softmax", # string 'softmax' or 'margin' - # number of top intents to normalize scores for softmax loss_type - # set to 0 to turn off normalization - "ranking_length": 10, - # how similar the algorithm should try - # to make embedding vectors for correct labels - "mu_pos": 0.8, # should be 0.0 < ... < 1.0 for 'cosine' - # maximum negative similarity for incorrect labels - "mu_neg": -0.4, # should be -1.0 < ... < 1.0 for 'cosine' - # flag: if true, only minimize the maximum similarity for incorrect labels - "use_max_sim_neg": True, - # scale loss inverse proportionally to confidence of correct prediction - "scale_loss": True, - # regularization parameters - # the scale of L2 regularization - "C2": 0.002, - # the scale of how critical the algorithm should be of minimizing the - # maximum similarity between embeddings of different labels - "C_emb": 0.8, - # dropout rate for rnn - "droprate": 0.2, - # visualization of accuracy - # how often to calculate training accuracy - "evaluate_every_num_epochs": 20, # small values may hurt performance - # how many examples to use for calculation of training accuracy - "evaluate_on_num_examples": 0, # large values may hurt performance + # ## Architecture of the used neural network + # Hidden layer sizes for layers before the embedding layers for user message + # and labels. + # The number of hidden layers is equal to the length of the corresponding + # list. + HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: []}, + # Whether to share the hidden layer weights between user message and labels. + SHARE_HIDDEN_LAYERS: False, + # ## Training parameters + # Initial and final batch sizes: + # Batch size will be linearly increased for each epoch. + BATCH_SIZES: [64, 256], + # Strategy used when creating batches. + # Can be either 'sequence' or 'balanced'. 
+ BATCH_STRATEGY: BALANCED, + # Number of epochs to train + EPOCHS: 300, + # Set random seed to any 'int' to get reproducible results + RANDOM_SEED: None, + # Initial learning rate for the optimizer + LEARNING_RATE: 0.001, + # ## Parameters for embeddings + # Dimension size of embedding vectors + EMBEDDING_DIMENSION: 20, + # Default dense dimension to use if no dense features are present. + DENSE_DIMENSION: {TEXT: 512, LABEL: 20}, + # The number of incorrect labels. The algorithm will minimize + # their similarity to the user input during training. + NUM_NEG: 20, + # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. + SIMILARITY_TYPE: AUTO, + # The type of the loss function, either 'softmax' or 'margin'. + LOSS_TYPE: SOFTMAX, + # Number of top actions to normalize scores for loss type 'softmax'. + # Set to 0 to turn off normalization. + RANKING_LENGTH: 10, + # Indicates how similar the algorithm should try to make embedding vectors + # for correct labels. + # Should be 0.0 < ... < 1.0 for 'cosine' similarity type. + MAX_POS_SIM: 0.8, + # Maximum negative similarity for incorrect labels. + # Should be -1.0 < ... < 1.0 for 'cosine' similarity type. + MAX_NEG_SIM: -0.4, + # If 'True' the algorithm only minimizes maximum similarity over + # incorrect intent labels, used only if 'loss_type' is set to 'margin'. + USE_MAX_NEG_SIM: True, + # Scale loss inverse proportionally to confidence of correct prediction + SCALE_LOSS: True, + # ## Regularization parameters + # The scale of regularization + REGULARIZATION_CONSTANT: 0.002, + # The scale of how important is to minimize the maximum similarity + # between embeddings of different labels. + NEGATIVE_MARGIN_SCALE: 0.8, + # Dropout rate for encoder + DROP_RATE: 0.2, + # Sparsity of the weights in dense layers + WEIGHT_SPARSITY: 0.8, + # If 'True' apply dropout to sparse tensors + SPARSE_INPUT_DROPOUT: False, + # ## Evaluation parameters + # How often calculate validation accuracy. + # Small values may hurt performance, e.g. model accuracy. + EVAL_NUM_EPOCHS: 20, + # How many examples to use for hold out validation set + # Large values may hurt performance, e.g. model accuracy. + EVAL_NUM_EXAMPLES: 0, } - # end default properties (DOC MARKER - don't remove) - - @staticmethod - def _check_old_config_variables(config: Dict[Text, Any]) -> None: - """Config migration warning""" - - removed_tokenization_params = [ - "intent_tokenization_flag", - "intent_split_symbol", - ] - for removed_param in removed_tokenization_params: - if removed_param in config: - raise_warning( - f"Intent tokenization has been moved to Tokenizer components. " - f"Your config still mentions '{removed_param}'. " - f"Tokenization may fail if you specify the parameter here. " - f"Please specify the parameter 'intent_tokenization_flag' " - f"and 'intent_split_symbol' in the " - f"tokenizer of your NLU pipeline", - FutureWarning, - ) - - # init helpers - def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: - self.hidden_layer_sizes = { - "text": config["hidden_layers_sizes_a"], - "label": config["hidden_layers_sizes_b"], - } - self.share_hidden_layers = config["share_hidden_layers"] - if ( - self.share_hidden_layers - and self.hidden_layer_sizes["text"] != self.hidden_layer_sizes["label"] - ): - raise ValueError( - "If hidden layer weights are shared," - "hidden_layer_sizes for a and b must coincide." 
- ) - - self.batch_in_size = config["batch_size"] - self.batch_in_strategy = config["batch_strategy"] - - self.epochs = config["epochs"] - - self.random_seed = self.component_config["random_seed"] - - def _load_embedding_params(self, config: Dict[Text, Any]) -> None: - self.embed_dim = config["embed_dim"] - self.num_neg = config["num_neg"] - self.dense_dim = config["dense_dim"] - - self.similarity_type = config["similarity_type"] - self.loss_type = config["loss_type"] - if self.similarity_type == "auto": - if self.loss_type == "softmax": - self.similarity_type = "inner" - elif self.loss_type == "margin": - self.similarity_type = "cosine" - - self.ranking_length = config["ranking_length"] - self.mu_pos = config["mu_pos"] - self.mu_neg = config["mu_neg"] - self.use_max_sim_neg = config["use_max_sim_neg"] - - self.scale_loss = config["scale_loss"] - - def _load_regularization_params(self, config: Dict[Text, Any]) -> None: - self.C2 = config["C2"] - self.C_emb = config["C_emb"] - self.droprate = config["droprate"] - - def _load_visual_params(self, config: Dict[Text, Any]) -> None: - self.evaluate_every_num_epochs = config["evaluate_every_num_epochs"] - if self.evaluate_every_num_epochs < 1: - self.evaluate_every_num_epochs = self.epochs - self.evaluate_on_num_examples = config["evaluate_on_num_examples"] - - def _load_params(self) -> None: - - self._check_old_config_variables(self.component_config) - self._tf_config = train_utils.load_tf_config(self.component_config) - self._load_nn_architecture_params(self.component_config) - self._load_embedding_params(self.component_config) - self._load_regularization_params(self.component_config) - self._load_visual_params(self.component_config) - - # package safety checks - @classmethod - def required_packages(cls) -> List[Text]: - return ["tensorflow"] def __init__( self, component_config: Optional[Dict[Text, Any]] = None, - inverted_label_dict: Optional[Dict[int, Text]] = None, - session: Optional["tf.Session"] = None, - graph: Optional["tf.Graph"] = None, - batch_placeholder: Optional["tf.Tensor"] = None, - similarity_all: Optional["tf.Tensor"] = None, - pred_confidence: Optional["tf.Tensor"] = None, - similarity: Optional["tf.Tensor"] = None, - message_embed: Optional["tf.Tensor"] = None, - label_embed: Optional["tf.Tensor"] = None, - all_labels_embed: Optional["tf.Tensor"] = None, - batch_tuple_sizes: Optional[Dict] = None, + index_label_id_mapping: Optional[Dict[int, Text]] = None, + index_tag_id_mapping: Optional[Dict[int, Text]] = None, + model: Optional[RasaModel] = None, ) -> None: - """Declare instance variables with default values""" - - super().__init__(component_config) - - self._load_params() - - # transform numbers to labels - self.inverted_label_dict = inverted_label_dict - # encode all label_ids with numbers - self._label_data = None - - # tf related instances - self.session = session - self.graph = graph - self.batch_in = batch_placeholder - self.sim_all = similarity_all - self.pred_confidence = pred_confidence - self.sim = similarity - - # persisted embeddings - self.message_embed = message_embed - self.label_embed = label_embed - self.all_labels_embed = all_labels_embed - - # keep the input tuple sizes in self.batch_in - self.batch_tuple_sizes = batch_tuple_sizes - - # internal tf instances - self._iterator = None - self._train_op = None - self._is_training = None - - # training data helpers: - @staticmethod - def _create_label_id_dict( - training_data: "TrainingData", attribute: Text - ) -> Dict[Text, int]: - """Create label_id 
dictionary""" - - distinct_label_ids = { - example.get(attribute) for example in training_data.intent_examples - } - {None} - return { - label_id: idx for idx, label_id in enumerate(sorted(distinct_label_ids)) - } - - @staticmethod - def _find_example_for_label( - label: Text, examples: List["Message"], attribute: Text - ) -> Optional["Message"]: - for ex in examples: - if ex.get(attribute) == label: - return ex - return None - - @staticmethod - def _check_labels_features_exist( - labels_example: List["Message"], attribute: Text - ) -> bool: - """Check if all labels have features set""" - - for label_example in labels_example: - if ( - label_example.get(SPARSE_FEATURE_NAMES[attribute]) is None - and label_example.get(DENSE_FEATURE_NAMES[attribute]) is None - ): - return False - return True - - @staticmethod - def _extract_and_add_features( - message: "Message", attribute: Text - ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]: - sparse_features = None - dense_features = None - - if message.get(SPARSE_FEATURE_NAMES[attribute]) is not None: - sparse_features = message.get(SPARSE_FEATURE_NAMES[attribute]) - - if message.get(DENSE_FEATURE_NAMES[attribute]) is not None: - dense_features = message.get(DENSE_FEATURE_NAMES[attribute]) - - if sparse_features is not None and dense_features is not None: - if sparse_features.shape[0] != dense_features.shape[0]: - raise ValueError( - f"Sequence dimensions for sparse and dense features " - f"don't coincide in '{message.text}' for attribute '{attribute}'." - ) - - if attribute != INTENT_ATTRIBUTE: - # Use only the CLS token vector as features - sparse_features = sequence_to_sentence_features(sparse_features) - dense_features = sequence_to_sentence_features(dense_features) - - return sparse_features, dense_features - - def _extract_labels_precomputed_features( - self, label_examples: List["Message"], attribute: Text = INTENT_ATTRIBUTE - ) -> List[np.ndarray]: - """Collect precomputed encodings""" - - sparse_features = [] - dense_features = [] - - for e in label_examples: - _sparse, _dense = self._extract_and_add_features(e, attribute) - if _sparse is not None: - sparse_features.append(_sparse) - if _dense is not None: - dense_features.append(_dense) - - sparse_features = np.array(sparse_features) - dense_features = np.array(dense_features) - - return [sparse_features, dense_features] - @staticmethod - def _compute_default_label_features( - labels_example: List["Message"], - ) -> List[np.ndarray]: - """Compute one-hot representation for the labels""" + component_config = component_config or {} - return [ - np.array( - [ - np.expand_dims(a, 0) - for a in np.eye(len(labels_example), dtype=np.float32) - ] - ) - ] + # the following properties cannot be adapted for the EmbeddingIntentClassifier + component_config[INTENT_CLASSIFICATION] = True + component_config[ENTITY_RECOGNITION] = False + component_config[MASKED_LM] = False + component_config[BILOU_FLAG] = False + component_config[NUM_TRANSFORMER_LAYERS] = 0 - def _create_label_data( - self, - training_data: "TrainingData", - label_id_dict: Dict[Text, int], - attribute: Text, - ) -> "SessionDataType": - """Create matrix with label_ids encoded in rows as bag of words. - - Find a training example for each label and get the encoded features - from the corresponding Message object. - If the features are already computed, fetch them from the message object - else compute a one hot encoding for the label as the feature vector. 
- """ - - # Collect one example for each label - labels_idx_example = [] - for label_name, idx in label_id_dict.items(): - label_example = self._find_example_for_label( - label_name, training_data.intent_examples, attribute - ) - labels_idx_example.append((idx, label_example)) - - # Sort the list of tuples based on label_idx - labels_idx_example = sorted(labels_idx_example, key=lambda x: x[0]) - labels_example = [example for (_, example) in labels_idx_example] - - # Collect features, precomputed if they exist, else compute on the fly - if self._check_labels_features_exist(labels_example, attribute): - features = self._extract_labels_precomputed_features( - labels_example, attribute - ) - else: - features = self._compute_default_label_features(labels_example) - - label_data = {} - self._add_to_session_data(label_data, "label_features", features) - self._add_mask_to_session_data(label_data, "label_mask", "label_features") - - return label_data - - def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]: - return [ - np.array( - [ - self._label_data["label_features"][0][label_id] - for label_id in label_ids - ] - ) - ] - - # noinspection PyPep8Naming - def _create_session_data( - self, - training_data: List["Message"], - label_id_dict: Optional[Dict[Text, int]] = None, - label_attribute: Optional[Text] = INTENT_ATTRIBUTE, - ) -> "SessionDataType": - """Prepare data for training and create a SessionDataType object""" - - X_sparse = [] - X_dense = [] - Y_sparse = [] - Y_dense = [] - label_ids = [] - - for e in training_data: - if e.get(label_attribute): - _sparse, _dense = self._extract_and_add_features(e, TEXT_ATTRIBUTE) - if _sparse is not None: - X_sparse.append(_sparse) - if _dense is not None: - X_dense.append(_dense) - - _sparse, _dense = self._extract_and_add_features(e, label_attribute) - if _sparse is not None: - Y_sparse.append(_sparse) - if _dense is not None: - Y_dense.append(_dense) - - if label_id_dict: - label_ids.append(label_id_dict[e.get(label_attribute)]) - - X_sparse = np.array(X_sparse) - X_dense = np.array(X_dense) - Y_sparse = np.array(Y_sparse) - Y_dense = np.array(Y_dense) - label_ids = np.array(label_ids) - - session_data = {} - self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense]) - self._add_to_session_data(session_data, "label_features", [Y_sparse, Y_dense]) - # explicitly add last dimension to label_ids - # to track correctly dynamic sequences - self._add_to_session_data( - session_data, "label_ids", [np.expand_dims(label_ids, -1)] - ) - - if label_id_dict and ( - "label_features" not in session_data or not session_data["label_features"] - ): - # no label features are present, get default features from _label_data - session_data["label_features"] = self._use_default_label_features(label_ids) - - self._add_mask_to_session_data(session_data, "text_mask", "text_features") - self._add_mask_to_session_data(session_data, "label_mask", "label_features") - - return session_data - - @staticmethod - def _add_to_session_data( - session_data: SessionDataType, key: Text, features: List[np.ndarray] - ): - if not features: - return - - session_data[key] = [] - - for data in features: - if data.size > 0: - session_data[key].append(data) - - @staticmethod - def _add_mask_to_session_data( - session_data: SessionDataType, key: Text, from_key: Text - ): - - session_data[key] = [] - - for data in session_data[from_key]: - if data.size > 0: - # explicitly add last dimension to mask - # to track correctly dynamic sequences - mask = 
np.array([np.ones((x.shape[0], 1)) for x in data]) - session_data[key].append(mask) - break - - # tf helpers: - def _create_tf_embed_fnn( - self, - x_in: "tf.Tensor", - layer_sizes: List[int], - fnn_name: Text, - embed_name: Text, - ) -> "tf.Tensor": - """Create nn with hidden layers and name""" - - x = train_utils.create_tf_fnn( - x_in, - layer_sizes, - self.droprate, - self.C2, - self._is_training, - layer_name_suffix=fnn_name, + super().__init__( + component_config, index_label_id_mapping, index_tag_id_mapping, model ) - return train_utils.create_tf_embed( - x, - self.embed_dim, - self.C2, - self.similarity_type, - layer_name_suffix=embed_name, - ) - - def _combine_sparse_dense_features( - self, - features: List[Union[tf.Tensor, tf.SparseTensor]], - mask: tf.Tensor, - name: Text, - ) -> tf.Tensor: - dense_features = [] - - dense_dim = self.dense_dim[name] - # if dense features are present use the feature dimension of the dense features - for f in features: - if not isinstance(f, tf.SparseTensor): - dense_dim = f.shape[-1] - break - - for f in features: - if isinstance(f, tf.SparseTensor): - dense_features.append( - train_utils.tf_dense_layer_for_sparse(f, dense_dim, name, self.C2) - ) - else: - dense_features.append(f) - - output = tf.concat(dense_features, axis=-1) * mask - # reduce dimensionality of output - output = tf.reduce_sum(output, axis=1) - - return output - - def _build_tf_train_graph( - self, session_data: SessionDataType - ) -> Tuple["tf.Tensor", "tf.Tensor"]: - - # get in tensors from generator - self.batch_in = self._iterator.get_next() - # convert encoded all labels into the batch format - label_batch = train_utils.prepare_batch(self._label_data) - # convert batch format into sparse and dense tensors - batch_data, _ = train_utils.batch_to_session_data(self.batch_in, session_data) - label_data, _ = train_utils.batch_to_session_data(label_batch, self._label_data) - - a = self._combine_sparse_dense_features( - batch_data["text_features"], batch_data["text_mask"][0], "text" - ) - b = self._combine_sparse_dense_features( - batch_data["label_features"], batch_data["label_mask"][0], "label" - ) - all_bs = self._combine_sparse_dense_features( - label_data["label_features"], label_data["label_mask"][0], "label" - ) - - self.message_embed = self._create_tf_embed_fnn( - a, - self.hidden_layer_sizes["text"], - fnn_name="text_label" if self.share_hidden_layers else "text", - embed_name="text", - ) - self.label_embed = self._create_tf_embed_fnn( - b, - self.hidden_layer_sizes["label"], - fnn_name="text_label" if self.share_hidden_layers else "label", - embed_name="label", + common_utils.raise_warning( + "'EmbeddingIntentClassifier' is deprecated and will be removed in version " + "2.0. 
Use 'DIETClassifier' instead.", + category=FutureWarning, + docs=DOCS_URL_MIGRATION_GUIDE, ) - self.all_labels_embed = self._create_tf_embed_fnn( - all_bs, - self.hidden_layer_sizes["label"], - fnn_name="text_label" if self.share_hidden_layers else "label", - embed_name="label", - ) - - return train_utils.calculate_loss_acc( - self.message_embed, - self.label_embed, - b, - self.all_labels_embed, - all_bs, - self.num_neg, - None, - self.loss_type, - self.mu_pos, - self.mu_neg, - self.use_max_sim_neg, - self.C_emb, - self.scale_loss, - ) - - def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor": - - shapes, types = train_utils.get_shapes_types(session_data) - - batch_placeholder = [] - for s, t in zip(shapes, types): - batch_placeholder.append(tf.placeholder(t, s)) - - self.batch_in = tf.tuple(batch_placeholder) - - batch_data, self.batch_tuple_sizes = train_utils.batch_to_session_data( - self.batch_in, session_data - ) - - a = self._combine_sparse_dense_features( - batch_data["text_features"], batch_data["text_mask"][0], "text" - ) - b = self._combine_sparse_dense_features( - batch_data["label_features"], batch_data["label_mask"][0], "label" - ) - - self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) - - self.message_embed = self._create_tf_embed_fnn( - a, - self.hidden_layer_sizes["text"], - fnn_name="text_label" if self.share_hidden_layers else "text", - embed_name="text", - ) - - self.sim_all = train_utils.tf_raw_sim( - self.message_embed[:, tf.newaxis, :], - self.all_labels_embed[tf.newaxis, :, :], - None, - ) - - self.label_embed = self._create_tf_embed_fnn( - b, - self.hidden_layer_sizes["label"], - fnn_name="text_label" if self.share_hidden_layers else "label", - embed_name="label", - ) - - self.sim = train_utils.tf_raw_sim( - self.message_embed[:, tf.newaxis, :], self.label_embed, None - ) - - return train_utils.confidence_from_sim(self.sim_all, self.similarity_type) - - @staticmethod - def _get_num_of_features(session_data: "SessionDataType", key: Text) -> int: - num_features = 0 - for data in session_data[key]: - if data.size > 0: - num_features += data[0].shape[-1] - return num_features - - def check_input_dimension_consistency(self, session_data: "SessionDataType"): - """Check if text features and intent features have the same dimension.""" - - if self.share_hidden_layers: - num_text_features = self._get_num_of_features(session_data, "text_features") - num_intent_features = self._get_num_of_features( - session_data, "label_features" - ) - - if num_text_features != num_intent_features: - raise ValueError( - "If embeddings are shared, " - "text features and label features " - "must coincide. Check the output dimensions of previous components." - ) - - def preprocess_train_data(self, training_data: "TrainingData"): - """Prepares data for training. - - Performs sanity checks on training data, extracts encodings for labels. 
- """ - - label_id_dict = self._create_label_id_dict( - training_data, attribute=INTENT_ATTRIBUTE - ) - - self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} - - self._label_data = self._create_label_data( - training_data, label_id_dict, attribute=INTENT_ATTRIBUTE - ) - - session_data = self._create_session_data( - training_data.intent_examples, - label_id_dict, - label_attribute=INTENT_ATTRIBUTE, - ) - - self.check_input_dimension_consistency(session_data) - - return session_data - - @staticmethod - def _check_enough_labels(session_data: "SessionDataType") -> bool: - return len(np.unique(session_data["label_ids"])) >= 2 - - def train( - self, - training_data: "TrainingData", - cfg: Optional["RasaNLUModelConfig"] = None, - **kwargs: Any, - ) -> None: - """Train the embedding intent classifier on a data set.""" - - logger.debug("Started training embedding classifier.") - - # set numpy random seed - np.random.seed(self.random_seed) - - session_data = self.preprocess_train_data(training_data) - - possible_to_train = self._check_enough_labels(session_data) - - if not possible_to_train: - logger.error( - "Can not train a classifier. " - "Need at least 2 different classes. " - "Skipping training of classifier." - ) - return - - if self.evaluate_on_num_examples: - session_data, eval_session_data = train_utils.train_val_split( - session_data, - self.evaluate_on_num_examples, - self.random_seed, - label_key="label_ids", - ) - else: - eval_session_data = None - - self.graph = tf.Graph() - with self.graph.as_default(): - # set random seed - tf.set_random_seed(self.random_seed) - - # allows increasing batch size - batch_size_in = tf.placeholder(tf.int64) - - ( - self._iterator, - train_init_op, - eval_init_op, - ) = train_utils.create_iterator_init_datasets( - session_data, - eval_session_data, - batch_size_in, - self.batch_in_strategy, - label_key="label_ids", - ) - - self._is_training = tf.placeholder_with_default(False, shape=()) - - loss, acc = self._build_tf_train_graph(session_data) - - # define which optimizer to use - self._train_op = tf.train.AdamOptimizer().minimize(loss) - - # train tensorflow graph - self.session = tf.Session(config=self._tf_config) - train_utils.train_tf_dataset( - train_init_op, - eval_init_op, - batch_size_in, - loss, - acc, - self._train_op, - self.session, - self._is_training, - self.epochs, - self.batch_in_size, - self.evaluate_on_num_examples, - self.evaluate_every_num_epochs, - ) - - # rebuild the graph for prediction - self.pred_confidence = self._build_tf_pred_graph(session_data) - - # process helpers - # noinspection PyPep8Naming - def _calculate_message_sim( - self, batch: Tuple[np.ndarray] - ) -> Tuple[np.ndarray, List[float]]: - """Calculate message similarities""" - - message_sim = self.session.run( - self.pred_confidence, - feed_dict={ - _x_in: _x for _x_in, _x in zip(self.batch_in, batch) if _x is not None - }, - ) - - message_sim = message_sim.flatten() # sim is a matrix - - label_ids = message_sim.argsort()[::-1] - - if self.loss_type == "softmax" and self.ranking_length > 0: - message_sim = train_utils.normalize(message_sim, self.ranking_length) - - message_sim[::-1].sort() - - # transform sim to python list for JSON serializing - return label_ids, message_sim.tolist() - - @staticmethod - def _text_features_present(session_data: SessionDataType) -> bool: - return np.array( - [ - f.nnz != 0 if isinstance(f, scipy.sparse.spmatrix) else f.any() - for features in session_data["text_features"] - for f in features - ] - ).any() - - def 
predict_label( - self, message: "Message" - ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]: - """Predicts the intent of the provided message.""" - - label = {"name": None, "confidence": 0.0} - label_ranking = [] - - if self.session is None: - logger.error( - "There is no trained tf.session: " - "component is either not trained or " - "didn't receive enough training data." - ) - return label, label_ranking - - # create session data from message and convert it into a batch of 1 - session_data = self._create_session_data([message]) - - # if no text-features are present (e.g. incoming message is not in the - # vocab), do not predict a random intent - if not self._text_features_present(session_data): - return label, label_ranking - - batch = train_utils.prepare_batch( - session_data, tuple_sizes=self.batch_tuple_sizes - ) - - # load tf graph and session - label_ids, message_sim = self._calculate_message_sim(batch) - - # if X contains all zeros do not predict some label - if label_ids.size > 0: - label = { - "name": self.inverted_label_dict[label_ids[0]], - "confidence": message_sim[0], - } - - if self.ranking_length and 0 < self.ranking_length < LABEL_RANKING_LENGTH: - output_length = self.ranking_length - else: - output_length = LABEL_RANKING_LENGTH - - ranking = list(zip(list(label_ids), message_sim)) - ranking = ranking[:output_length] - label_ranking = [ - {"name": self.inverted_label_dict[label_idx], "confidence": score} - for label_idx, score in ranking - ] - - return label, label_ranking - - def process(self, message: "Message", **kwargs: Any) -> None: - """Return the most likely label and its similarity to the input.""" - - label, label_ranking = self.predict_label(message) - - message.set("intent", label, add_to_output=True) - message.set("intent_ranking", label_ranking, add_to_output=True) - - def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]: - """Persist this model into the passed directory. - - Return the metadata necessary to load the model again. 
- """ - - if self.session is None: - return {"file": None} - - checkpoint = os.path.join(model_dir, file_name + ".ckpt") - - try: - os.makedirs(os.path.dirname(checkpoint)) - except OSError as e: - # be happy if someone already created the path - import errno - - if e.errno != errno.EEXIST: - raise - with self.graph.as_default(): - train_utils.persist_tensor("batch_placeholder", self.batch_in, self.graph) - - train_utils.persist_tensor("similarity_all", self.sim_all, self.graph) - train_utils.persist_tensor( - "pred_confidence", self.pred_confidence, self.graph - ) - train_utils.persist_tensor("similarity", self.sim, self.graph) - - train_utils.persist_tensor("message_embed", self.message_embed, self.graph) - train_utils.persist_tensor("label_embed", self.label_embed, self.graph) - train_utils.persist_tensor( - "all_labels_embed", self.all_labels_embed, self.graph - ) - - saver = tf.train.Saver() - saver.save(self.session, checkpoint) - - with open( - os.path.join(model_dir, file_name + ".inv_label_dict.pkl"), "wb" - ) as f: - pickle.dump(self.inverted_label_dict, f) - - with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "wb") as f: - pickle.dump(self._tf_config, f) - - with open( - os.path.join(model_dir, file_name + ".batch_tuple_sizes.pkl"), "wb" - ) as f: - pickle.dump(self.batch_tuple_sizes, f) - - return {"file": file_name} - - @classmethod - def load( - cls, - meta: Dict[Text, Any], - model_dir: Text = None, - model_metadata: "Metadata" = None, - cached_component: Optional["EmbeddingIntentClassifier"] = None, - **kwargs: Any, - ) -> "EmbeddingIntentClassifier": - """Loads the trained model from the provided directory.""" - - if model_dir and meta.get("file"): - file_name = meta.get("file") - checkpoint = os.path.join(model_dir, file_name + ".ckpt") - - with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "rb") as f: - _tf_config = pickle.load(f) - - graph = tf.Graph() - with graph.as_default(): - session = tf.compat.v1.Session(config=_tf_config) - saver = tf.compat.v1.train.import_meta_graph(checkpoint + ".meta") - - saver.restore(session, checkpoint) - - batch_in = train_utils.load_tensor("batch_placeholder") - - sim_all = train_utils.load_tensor("similarity_all") - pred_confidence = train_utils.load_tensor("pred_confidence") - sim = train_utils.load_tensor("similarity") - - message_embed = train_utils.load_tensor("message_embed") - label_embed = train_utils.load_tensor("label_embed") - all_labels_embed = train_utils.load_tensor("all_labels_embed") - - with open( - os.path.join(model_dir, file_name + ".inv_label_dict.pkl"), "rb" - ) as f: - inv_label_dict = pickle.load(f) - - with open( - os.path.join(model_dir, file_name + ".batch_tuple_sizes.pkl"), "rb" - ) as f: - batch_tuple_sizes = pickle.load(f) - - return cls( - component_config=meta, - inverted_label_dict=inv_label_dict, - session=session, - graph=graph, - batch_placeholder=batch_in, - similarity_all=sim_all, - pred_confidence=pred_confidence, - similarity=sim, - message_embed=message_embed, - label_embed=label_embed, - all_labels_embed=all_labels_embed, - batch_tuple_sizes=batch_tuple_sizes, - ) - - else: - raise_warning( - f"Failed to load nlu model. " - f"Maybe the path '{os.path.abspath(model_dir)}' doesn't exist?" 
- ) - return cls(component_config=meta) diff --git a/rasa/nlu/classifiers/keyword_intent_classifier.py b/rasa/nlu/classifiers/keyword_intent_classifier.py index e08bf176370a..adebe6217243 100644 --- a/rasa/nlu/classifiers/keyword_intent_classifier.py +++ b/rasa/nlu/classifiers/keyword_intent_classifier.py @@ -1,26 +1,22 @@ import os import logging -import typing import re from typing import Any, Dict, Optional, Text from rasa.constants import DOCS_URL_COMPONENTS from rasa.nlu import utils -from rasa.nlu.components import Component -from rasa.nlu.training_data import Message -from rasa.nlu.constants import INTENT_ATTRIBUTE +from rasa.nlu.classifiers.classifier import IntentClassifier +from rasa.nlu.constants import INTENT from rasa.utils.common import raise_warning +from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu.training_data import TrainingData +from rasa.nlu.model import Metadata +from rasa.nlu.training_data import Message logger = logging.getLogger(__name__) -if typing.TYPE_CHECKING: - from rasa.nlu.config import RasaNLUModelConfig - from rasa.nlu.training_data import TrainingData - from rasa.nlu.model import Metadata - from rasa.nlu.training_data import Message - -class KeywordIntentClassifier(Component): +class KeywordIntentClassifier(IntentClassifier): """Intent classifier using simple keyword matching. @@ -29,8 +25,6 @@ class KeywordIntentClassifier(Component): """ - provides = [INTENT_ATTRIBUTE] - defaults = {"case_sensitive": True} def __init__( @@ -46,8 +40,8 @@ def __init__( def train( self, - training_data: "TrainingData", - cfg: Optional["RasaNLUModelConfig"] = None, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, **kwargs: Any, ) -> None: @@ -55,19 +49,19 @@ def train( for ex in training_data.training_examples: if ( ex.text in self.intent_keyword_map.keys() - and ex.get(INTENT_ATTRIBUTE) != self.intent_keyword_map[ex.text] + and ex.get(INTENT) != self.intent_keyword_map[ex.text] ): duplicate_examples.add(ex.text) raise_warning( f"Keyword '{ex.text}' is a keyword to trigger intent " f"'{self.intent_keyword_map[ex.text]}' and also " - f"intent '{ex.get(INTENT_ATTRIBUTE)}', it will be removed " + f"intent '{ex.get(INTENT)}', it will be removed " f"from the list of keywords for both of them. 
" f"Remove (one of) the duplicates from the training data.", docs=DOCS_URL_COMPONENTS + "#keyword-intent-classifier", ) else: - self.intent_keyword_map[ex.text] = ex.get(INTENT_ATTRIBUTE) + self.intent_keyword_map[ex.text] = ex.get(INTENT) for keyword in duplicate_examples: self.intent_keyword_map.pop(keyword) logger.debug( @@ -110,8 +104,8 @@ def process(self, message: Message, **kwargs: Any) -> None: confidence = 0.0 if intent_name is None else 1.0 intent = {"name": intent_name, "confidence": confidence} - if message.get(INTENT_ATTRIBUTE) is None or intent is not None: - message.set(INTENT_ATTRIBUTE, intent, add_to_output=True) + if message.get(INTENT) is None or intent is not None: + message.set(INTENT, intent, add_to_output=True) def _map_keyword_to_intent(self, text: Text) -> Optional[Text]: re_flag = 0 if self.case_sensitive else re.IGNORECASE @@ -144,7 +138,7 @@ def load( cls, meta: Dict[Text, Any], model_dir: Optional[Text] = None, - model_metadata: "Metadata" = None, + model_metadata: Metadata = None, cached_component: Optional["KeywordIntentClassifier"] = None, **kwargs: Any, ) -> "KeywordIntentClassifier": @@ -157,7 +151,7 @@ def load( else: raise_warning( f"Failed to load key word file for `IntentKeywordClassifier`, " - f"maybe {keyword_file} does not exist?", + f"maybe {keyword_file} does not exist?" ) intent_keyword_map = None return cls(meta, intent_keyword_map) diff --git a/rasa/nlu/classifiers/mitie_intent_classifier.py b/rasa/nlu/classifiers/mitie_intent_classifier.py index cf156980b30c..5a971ab18b9a 100644 --- a/rasa/nlu/classifiers/mitie_intent_classifier.py +++ b/rasa/nlu/classifiers/mitie_intent_classifier.py @@ -1,22 +1,24 @@ import os import typing -from typing import Any, Dict, List, Optional, Text +from typing import Any, Dict, List, Optional, Text, Type +from rasa.nlu.utils.mitie_utils import MitieNLP +from rasa.nlu.tokenizers.tokenizer import Tokenizer +from rasa.nlu.classifiers.classifier import IntentClassifier from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Metadata -from rasa.nlu.constants import TOKENS_NAMES, TEXT_ATTRIBUTE, INTENT_ATTRIBUTE +from rasa.nlu.constants import TOKENS_NAMES, TEXT, INTENT from rasa.nlu.training_data import Message, TrainingData if typing.TYPE_CHECKING: import mitie -class MitieIntentClassifier(Component): - - provides = [INTENT_ATTRIBUTE] - - requires = [TOKENS_NAMES[TEXT_ATTRIBUTE], "mitie_feature_extractor", "mitie_file"] +class MitieIntentClassifier(IntentClassifier): + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [MitieNLP, Tokenizer] def __init__( self, component_config: Optional[Dict[Text, Any]] = None, clf=None @@ -32,7 +34,10 @@ def required_packages(cls) -> List[Text]: return ["mitie"] def train( - self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, ) -> None: import mitie @@ -49,7 +54,7 @@ def train( for example in training_data.intent_examples: tokens = self._tokens_of_message(example) - trainer.add_labeled_text(tokens, example.get(INTENT_ATTRIBUTE)) + trainer.add_labeled_text(tokens, example.get(INTENT)) if training_data.intent_examples: # we can not call train if there are no examples! 
@@ -79,7 +84,7 @@ def process(self, message: Message, **kwargs: Any) -> None: @staticmethod def _tokens_of_message(message) -> List[Text]: - tokens = [token.text for token in message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], [])] + tokens = [token.text for token in message.get(TOKENS_NAMES[TEXT], [])] # return tokens without CLS token return tokens[:-1] diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index 70e6a6d0f276..47bd8940e907 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -2,20 +2,22 @@ import os import typing import warnings -from typing import Any, Dict, List, Optional, Text, Tuple +from typing import Any, Dict, List, Optional, Text, Tuple, Type import numpy as np -from rasa.constants import DOCS_URL_TRAINING_DATA_NLU -from rasa.nlu import utils +import rasa.utils.io as io_utils +from rasa.constants import DOCS_URL_TRAINING_DATA_NLU, DOCS_URL_MIGRATION_GUIDE from rasa.nlu.classifiers import LABEL_RANKING_LENGTH +from rasa.nlu.featurizers.featurizer import DenseFeaturizer from rasa.nlu.components import Component +from rasa.nlu.classifiers.classifier import IntentClassifier from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT_ATTRIBUTE +from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT from rasa.nlu.featurizers.featurizer import sequence_to_sentence_features from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData -from rasa.utils.common import raise_warning +import rasa.utils.common as common_utils logger = logging.getLogger(__name__) @@ -23,12 +25,12 @@ import sklearn -class SklearnIntentClassifier(Component): +class SklearnIntentClassifier(IntentClassifier): """Intent classifier using the sklearn framework""" - provides = ["intent", "intent_ranking"] - - requires = [DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]] + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [DenseFeaturizer] defaults = { # C parameter of the svm - cross validation will select the best value @@ -63,6 +65,13 @@ def __init__( self.le = LabelEncoder() self.clf = clf + common_utils.raise_warning( + "'SklearnIntentClassifier' is deprecated and will be removed in version " + "2.0. Use 'DIETClassifier' instead.", + category=FutureWarning, + docs=DOCS_URL_MIGRATION_GUIDE, + ) + @classmethod def required_packages(cls) -> List[Text]: return ["sklearn"] @@ -82,7 +91,10 @@ def transform_labels_num2str(self, y: np.ndarray) -> np.ndarray: return self.le.inverse_transform(y) def train( - self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, ) -> None: """Train the intent classifier on a data set.""" @@ -91,7 +103,7 @@ def train( labels = [e.get("intent") for e in training_data.intent_examples] if len(set(labels)) < 2: - raise_warning( + common_utils.raise_warning( "Can not train an intent classifier as there are not " "enough intents. Need at least 2 different intents. 
" "Skipping training of intent classifier.", @@ -102,7 +114,7 @@ def train( X = np.stack( [ sequence_to_sentence_features( - example.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) + example.get(DENSE_FEATURE_NAMES[TEXT]) ) for example in training_data.intent_examples ] @@ -162,7 +174,7 @@ def process(self, message: Message, **kwargs: Any) -> None: intent_ranking = [] else: X = sequence_to_sentence_features( - message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) + message.get(DENSE_FEATURE_NAMES[TEXT]) ).reshape(1, -1) intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) @@ -219,10 +231,10 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]] classifier_file_name = file_name + "_classifier.pkl" encoder_file_name = file_name + "_encoder.pkl" if self.clf and self.le: - utils.json_pickle( + io_utils.json_pickle( os.path.join(model_dir, encoder_file_name), self.le.classes_ ) - utils.json_pickle( + io_utils.json_pickle( os.path.join(model_dir, classifier_file_name), self.clf.best_estimator_ ) return {"classifier": classifier_file_name, "encoder": encoder_file_name} @@ -242,8 +254,8 @@ def load( encoder_file = os.path.join(model_dir, meta.get("encoder")) if os.path.exists(classifier_file): - classifier = utils.json_unpickle(classifier_file) - classes = utils.json_unpickle(encoder_file) + classifier = io_utils.json_unpickle(classifier_file) + classes = io_utils.json_unpickle(encoder_file) encoder = LabelEncoder() encoder.classes_ = classes return cls(meta, classifier, encoder) diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py index 25d2f6ddd1f4..52dc535c546e 100644 --- a/rasa/nlu/components.py +++ b/rasa/nlu/components.py @@ -1,9 +1,9 @@ import logging import typing -from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple +from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple, Type -from rasa.nlu.config import RasaNLUModelConfig, override_defaults -from rasa.nlu.constants import RESPONSE_ATTRIBUTE +from rasa.constants import DOCS_URL_MIGRATION_GUIDE +from rasa.nlu.config import RasaNLUModelConfig, override_defaults, InvalidConfigError from rasa.nlu.training_data import Message, TrainingData from rasa.utils.common import raise_warning @@ -14,8 +14,15 @@ def find_unavailable_packages(package_names: List[Text]) -> Set[Text]: - """Tries to import all the package names and returns - the packages where it failed.""" + """Tries to import all package names and returns the packages where it failed. + + Args: + package_names: The package names to import. + + Returns: + Package names that could not be imported. + """ + import importlib failed_imports = set() @@ -28,8 +35,12 @@ def find_unavailable_packages(package_names: List[Text]) -> Set[Text]: def validate_requirements(component_names: List[Text]) -> None: - """Ensures that all required importable python packages are installed to - instantiate and used the passed components.""" + """Validates that all required importable python packages are installed. + + Args: + component_names: The list of component names. + """ + from rasa.nlu import registry # Validate that all required packages are installed @@ -51,17 +62,15 @@ def validate_requirements(component_names: List[Text]) -> None: ) -def validate_arguments( - pipeline: List["Component"], - context: Dict[Text, Any], - allow_empty_pipeline: bool = False, -) -> None: - """Validates a pipeline before it is run. 
Ensures, that all - arguments are present to train the pipeline.""" +def validate_empty_pipeline(pipeline: List["Component"]) -> None: + """Ensures the pipeline is not empty. - # Ensure the pipeline is not empty - if not allow_empty_pipeline and len(pipeline) == 0: - raise ValueError( + Args: + pipeline: the list of the :class:`rasa.nlu.components.Component`. + """ + + if len(pipeline) == 0: + raise InvalidConfigError( "Can not train an empty pipeline. " "Make sure to specify a proper pipeline in " "the configuration using the 'pipeline' key. " @@ -69,58 +78,123 @@ def validate_arguments( "NOT supported anymore." ) - provided_properties = set(context.keys()) +def validate_only_one_tokenizer_is_used(pipeline: List["Component"]) -> None: + """Validates that only one tokenizer is present in the pipeline. + + Args: + pipeline: the list of the :class:`rasa.nlu.components.Component`. + """ + + from rasa.nlu.tokenizers.tokenizer import Tokenizer + + tokenizer_names = [] for component in pipeline: - for r in component.requires: - if isinstance(r, Tuple): - validate_requires_any_of(r, provided_properties, str(component.name)) - else: - if r not in provided_properties: - raise Exception( - f"Failed to validate component {component.name}. " - f"Missing property: '{r}'" - ) + if isinstance(component, Tokenizer): + tokenizer_names.append(component.name) - provided_properties.update(component.provides) + if len(tokenizer_names) > 1: + raise InvalidConfigError( + f"More than one tokenizer is used: {tokenizer_names}. " + f"You can use only one tokenizer." + ) -def any_of(*args) -> Tuple[Any]: - """Helper function to define that one of the given arguments is required - by a component. +def _required_component_in_pipeline( + required_component: Type["Component"], pipeline: List["Component"] +) -> bool: + """Checks that the required component is present in the pipeline. - Should be used inside `requires`.""" - return args + Args: + required_component: A class name of the required component. + pipeline: The list of the :class:`rasa.nlu.components.Component`. + Returns: + `True` if required_component is in the pipeline, `False` otherwise. + """ -def validate_requires_any_of( - required_properties: Tuple[Text], - provided_properties: Set[Text], - component_name: Text, -) -> None: - """Validates that at least one of the given required properties is present in - the provided properties.""" + for previous_component in pipeline: + if isinstance(previous_component, required_component): + return True + return False - property_present = any( - [property in provided_properties for property in required_properties] - ) - if not property_present: - raise Exception( - f"Failed to validate component '{component_name}'. " - f"Missing one of the following properties: " - f"{required_properties}." +def _check_deprecated_attributes(component: "Component") -> None: + """Checks that the component doesn't have deprecated attributes. + + Args: + component: The :class:`rasa.nlu.components.Component`. + """ + + if hasattr(component, "provides"): + raise_warning( + f"'{component.name}' contains property 'provides', " + f"which is deprecated. There is no need to specify " + f"the list of attributes that a component provides.", + category=FutureWarning, + docs=DOCS_URL_MIGRATION_GUIDE, + ) + if hasattr(component, "requires"): + raise_warning( + f"'{component.name}' contains property 'requires', " + f"which is deprecated. 
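A small self-contained illustration of the single-tokenizer check introduced above, using toy stand-in classes rather than the real Rasa components:

```python
from typing import List


class Component: ...
class Tokenizer(Component): ...
class WhitespaceTokenizer(Tokenizer): ...
class ConveRTTokenizer(Tokenizer): ...
class CountVectorsFeaturizer(Component): ...


def validate_only_one_tokenizer_is_used(pipeline: List[Component]) -> None:
    # isinstance covers subclasses, so every concrete tokenizer is counted
    tokenizer_names = [type(c).__name__ for c in pipeline if isinstance(c, Tokenizer)]
    if len(tokenizer_names) > 1:
        raise ValueError(
            f"More than one tokenizer is used: {tokenizer_names}. "
            f"You can use only one tokenizer."
        )


validate_only_one_tokenizer_is_used([WhitespaceTokenizer(), CountVectorsFeaturizer()])  # ok
try:
    validate_only_one_tokenizer_is_used([WhitespaceTokenizer(), ConveRTTokenizer()])
except ValueError as error:
    print(error)
```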
Use 'required_components()' method " + f"to specify which components are required to be present " + f"in the pipeline by this component.", + category=FutureWarning, + docs=DOCS_URL_MIGRATION_GUIDE, ) +def validate_required_components(pipeline: List["Component"]) -> None: + """Validates that all required components are present in the pipeline. + + Args: + pipeline: The list of the :class:`rasa.nlu.components.Component`. + """ + + for i, component in enumerate(pipeline): + _check_deprecated_attributes(component) + + missing_components = [] + for required_component in component.required_components(): + if not _required_component_in_pipeline(required_component, pipeline[:i]): + missing_components.append(required_component.name) + + if missing_components: + raise InvalidConfigError( + f"'{component.name}' requires {missing_components}. " + f"Add required components to the pipeline." + ) + + +def validate_pipeline(pipeline: List["Component"]) -> None: + """Validates the pipeline. + + Args: + pipeline: The list of the :class:`rasa.nlu.components.Component`. + """ + + validate_empty_pipeline(pipeline) + validate_only_one_tokenizer_is_used(pipeline) + validate_required_components(pipeline) + + def validate_required_components_from_data( pipeline: List["Component"], data: TrainingData -): +) -> None: + """Validates that all components are present in the pipeline based on data. + + Args: + pipeline: The list of the :class:`rasa.nlu.components.Component`. + data: The :class:`rasa.nlu.training_data.training_data.TrainingData`. + """ + + from rasa.nlu.selectors.response_selector import ResponseSelector response_selector_exists = False for component in pipeline: # check if a response selector is part of NLU pipeline - if RESPONSE_ATTRIBUTE in component.provides: + if isinstance(component, ResponseSelector): response_selector_exists = True if len(data.response_examples) and not response_selector_exists: @@ -131,8 +205,7 @@ def validate_required_components_from_data( class MissingArgumentError(ValueError): - """Raised when a function is called and not all parameters can be - filled from the context / config. + """Raised when not all parameters can be filled from the context / config. Attributes: message -- explanation of which parameter is missing @@ -167,7 +240,7 @@ def __str__(self) -> Text: class ComponentMetaclass(type): - """Metaclass with `name` class property""" + """Metaclass with `name` class property.""" @property def name(cls): @@ -195,7 +268,8 @@ class Component(metaclass=ComponentMetaclass): components a component can use to do its own processing. For example, a featurizer component can provide features that are used by another component down - the pipeline to do intent classification.""" + the pipeline to do intent classification. + """ # Component class name is used when integrating it in a # pipeline. E.g. ``[ComponentA, ComponentB]`` @@ -207,21 +281,17 @@ def name(self): return type(self).name - # Defines what attributes the pipeline component will - # provide when called. The listed attributes - # should be set by the component on the message object - # during test and train, e.g. - # ```message.set("entities", [...])``` - provides = [] - - # Which attributes on a message are required by this - # component. E.g. if requires contains "tokens", than a - # previous component in the pipeline needs to have "tokens" - # within the above described `provides` property. 
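The move from string-based `provides`/`requires` matching to class-based `required_components()` checks can be summarized in a few lines. A toy sketch with standalone classes (using `__name__` where Rasa uses its `name` property):

```python
from typing import List, Type


class Component:
    @classmethod
    def required_components(cls) -> List[Type["Component"]]:
        return []


class Tokenizer(Component): ...
class WhitespaceTokenizer(Tokenizer): ...


class Featurizer(Component):
    @classmethod
    def required_components(cls) -> List[Type[Component]]:
        # a featurizer needs some tokenizer earlier in the pipeline
        return [Tokenizer]


def validate_required_components(pipeline: List[Component]) -> None:
    for i, component in enumerate(pipeline):
        missing = [
            required.__name__
            for required in component.required_components()
            # any subclass of the required type earlier in the pipeline counts
            if not any(isinstance(previous, required) for previous in pipeline[:i])
        ]
        if missing:
            raise ValueError(f"'{type(component).__name__}' requires {missing}.")


validate_required_components([WhitespaceTokenizer(), Featurizer()])  # passes
try:
    validate_required_components([Featurizer()])
except ValueError as error:
    print(error)  # 'Featurizer' requires ['Tokenizer'].
```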
- # Use `any_of("option_1", "option_2")` to define that either - # "option_1" or "option_2" needs to be present in the - # provided properties from the previous components. - requires = [] + # Which components are required by this component. + # Listed components should appear before the component itself in the pipeline. + @classmethod + def required_components(cls) -> List[Type["Component"]]: + """Specify which components need to be present in the pipeline. + + Returns: + The list of class names of required components. + """ + + return [] # Defines the default configuration parameters of a component # these values can be overwritten in the pipeline configuration @@ -251,13 +321,19 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: @classmethod def required_packages(cls) -> List[Text]: - """Specify which python packages need to be installed to use this - component, e.g. ``["spacy"]``. More specifically, these should be + """Specify which python packages need to be installed. + + E.g. ``["spacy"]``. More specifically, these should be importable python package names e.g. `sklearn` and not package names in the dependencies sense e.g. `scikit-learn` This list of requirements allows us to fail early during training - if a required package is not installed.""" + if a required package is not installed. + + Returns: + The list of required package names. + """ + return [] @classmethod @@ -276,8 +352,18 @@ def load( this component needs to be able to restore itself. Components can rely on any context attributes that are created by :meth:`components.Component.create` - calls to components previous - to this one.""" + calls to components previous to this one. + + Args: + meta: Any configuration parameter related to the model. + model_dir: The directory to load the component from. + model_metadata: The model's :class:`rasa.nlu.model.Metadata`. + cached_component: The cached component. + + Returns: + the loaded component + """ + if cached_component: return cached_component else: @@ -289,7 +375,15 @@ def create( ) -> "Component": """Creates this component (e.g. before a training is started). - Method can access all configuration parameters.""" + Method can access all configuration parameters. + + Args: + component_config: The components configuration parameters. + config: The model configuration parameters. + + Returns: + The created component. + """ # Check language supporting language = config.language @@ -300,7 +394,7 @@ def create( return cls(component_config) def provide_context(self) -> Optional[Dict[Text, Any]]: - """Initialize this component for a new pipeline + """Initialize this component for a new pipeline. This function will be called before the training is started and before the first message is processed using @@ -310,11 +404,19 @@ def provide_context(self) -> Optional[Dict[Text, Any]]: components do not need to implement this method. It's mostly used to initialize framework environments like MITIE and spacy - (e.g. loading word vectors for the pipeline).""" + (e.g. loading word vectors for the pipeline). + + Returns: + The updated component configuration. + """ + pass def train( - self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, ) -> None: """Train this component. 
@@ -325,7 +427,15 @@ def train( of ANY component and on any context attributes created by a call to :meth:`rasa.nlu.components.Component.train` - of components previous to this one.""" + of components previous to this one. + + Args: + training_data: + The :class:`rasa.nlu.training_data.training_data.TrainingData`. + config: The model configuration parameters. + + """ + pass def process(self, message: Message, **kwargs: Any) -> None: @@ -338,11 +448,25 @@ def process(self, message: Message, **kwargs: Any) -> None: of ANY component and on any context attributes created by a call to :meth:`rasa.nlu.components.Component.process` - of components previous to this one.""" + of components previous to this one. + + Args: + message: The :class:`rasa.nlu.training_data.message.Message` to process. + + """ + pass def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: - """Persist this component to disk for future loading.""" + """Persist this component to disk for future loading. + + Args: + file_name: The file name of the model. + model_dir: The directory to store the model to. + + Returns: + An optional dictionary with any information about the stored model. + """ pass @@ -355,7 +479,15 @@ def cache_key( If a component is unique to a model it should return None. Otherwise, an instantiation of the component will be reused for all models where the - metadata creates the same key.""" + metadata creates the same key. + + Args: + component_meta: The component configuration. + model_metadata: The component's :class:`rasa.nlu.model.Metadata`. + + Returns: + A unique caching key. + """ return None @@ -379,7 +511,13 @@ def prepare_partial_processing( The pipeline should be a list of components that are previous to this one in the pipeline and have already finished their training (and can therefore - be safely used to process messages).""" + be safely used to process messages). + + Args: + pipeline: The list of components. + context: The context of processing. + + """ self.partial_processing_pipeline = pipeline self.partial_processing_context = context @@ -389,7 +527,15 @@ def partially_process(self, message: Message) -> Message: training (e.g. external training data). The passed message will be processed by all components - previous to this one in the pipeline.""" + previous to this one in the pipeline. + + Args: + message: The :class:`rasa.nlu.training_data.message.Message` to process. + + Returns: + The processed :class:`rasa.nlu.training_data.message.Message`. + + """ if self.partial_processing_context is not None: for component in self.partial_processing_pipeline: @@ -403,7 +549,14 @@ def can_handle_language(cls, language: Hashable) -> bool: """Check if component supports a specific language. This method can be overwritten when needed. (e.g. dynamically - determine which language is supported.)""" + determine which language is supported.) + + Args: + language: The language to check. + + Returns: + `True` if component can handle specific language, `False` otherwise. + """ # if language_list is set to `None` it means: support all languages if language is None or cls.language_list is None: @@ -463,19 +616,21 @@ def load_component( model_metadata: "Metadata", **context: Any, ) -> Component: - """Tries to retrieve a component from the cache, else calls + """Loads a component. + + Tries to retrieve a component from the cache, else calls ``load`` to create a new component. 
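Given the revised `Component` API documented above, a minimal custom component might look like the skeleton below. This is illustrative only and assumes a Rasa 1.x installation that already contains the changes in this diff:

```python
from typing import Any, Dict, List, Optional, Text, Type

from rasa.nlu.components import Component
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.tokenizers.tokenizer import Tokenizer
from rasa.nlu.training_data import Message, TrainingData


class SentenceLengthTagger(Component):
    """Toy component that stores the token count on each message."""

    defaults = {"attribute_name": "token_count"}

    @classmethod
    def required_components(cls) -> List[Type[Component]]:
        # replaces the deprecated `requires = [...]` class attribute
        return [Tokenizer]

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig] = None,
        **kwargs: Any,
    ) -> None:
        pass  # nothing to learn in this toy example

    def process(self, message: Message, **kwargs: Any) -> None:
        # "tokens" includes the trailing __CLS__ token Rasa 1.x tokenizers append
        tokens = message.get("tokens") or []
        message.set(self.component_config["attribute_name"], len(tokens))

    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
        return None  # stateless, nothing to store
```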
Args: component_meta: - the metadata of the component to load in the pipeline + The metadata of the component to load in the pipeline. model_dir: - the directory to read the model from + The directory to read the model from. model_metadata (Metadata): - the model's :class:`rasa.nlu.model.Metadata` + The model's :class:`rasa.nlu.model.Metadata`. Returns: - Component: the loaded component. + The loaded component. """ from rasa.nlu import registry @@ -501,8 +656,19 @@ def load_component( def create_component( self, component_config: Dict[Text, Any], cfg: RasaNLUModelConfig ) -> Component: - """Tries to retrieve a component from the cache, - calls `create` to create a new component.""" + """Creates a component. + + Tries to retrieve a component from the cache, + calls `create` to create a new component. + + Args: + component_config: The component configuration. + cfg: The model configuration. + + Returns: + The created component. + """ + from rasa.nlu import registry from rasa.nlu.model import Metadata diff --git a/rasa/nlu/config.py b/rasa/nlu/config.py index a39d5632466a..00bfdf12a56b 100644 --- a/rasa/nlu/config.py +++ b/rasa/nlu/config.py @@ -2,12 +2,16 @@ import logging import os import ruamel.yaml as yaml -from typing import Any, Dict, List, Optional, Text, Union, Tuple +from typing import Any, Dict, List, Optional, Text, Union import rasa.utils.io -from rasa.constants import DEFAULT_CONFIG_PATH, DOCS_URL_PIPELINE +from rasa.constants import ( + DEFAULT_CONFIG_PATH, + DOCS_URL_PIPELINE, + DOCS_URL_MIGRATION_GUIDE, +) from rasa.nlu.utils import json_to_string -from rasa.utils.common import raise_warning +import rasa.utils.common as common_utils logger = logging.getLogger(__name__) @@ -68,7 +72,7 @@ def component_config_from_pipeline( c = pipeline[index] return override_defaults(defaults, c) except IndexError: - raise_warning( + common_utils.raise_warning( f"Tried to get configuration value for component " f"number {index} which is not part of your pipeline. " f"Returning `defaults`.", @@ -103,7 +107,7 @@ def __init__(self, configuration_values: Optional[Dict[Text, Any]] = None) -> No "tensorflow_embedding": "supervised_embeddings", } if template_name in new_names: - raise_warning( + common_utils.raise_warning( f"You have specified the pipeline template " f"'{template_name}' which has been renamed to " f"'{new_names[template_name]}'. " @@ -118,6 +122,15 @@ def __init__(self, configuration_values: Optional[Dict[Text, Any]] = None) -> No pipeline = registry.pipeline_template(template_name) if pipeline: + common_utils.raise_warning( + "You are using a pipeline template. All pipelines templates " + "are deprecated and will be removed in version 2.0. 
Please add " + "the components you want to use directly to your configuration " + "file.", + FutureWarning, + docs=DOCS_URL_MIGRATION_GUIDE, + ) + # replaces the template with the actual components self.__dict__["pipeline"] = pipeline else: @@ -181,7 +194,7 @@ def set_component_attr(self, index, **kwargs) -> None: try: self.pipeline[index].update(kwargs) except IndexError: - raise_warning( + common_utils.raise_warning( f"Tried to set configuration value for component " f"number {index} which is not part of the pipeline.", docs=DOCS_URL_PIPELINE, diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 17139a7ba2ef..bc2334aa7fc1 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -1,42 +1,51 @@ -TEXT_ATTRIBUTE = "text" +TEXT = "text" RESPONSE_KEY_ATTRIBUTE = "response_key" -INTENT_ATTRIBUTE = "intent" +INTENT = "intent" -RESPONSE_ATTRIBUTE = "response" +RESPONSE = "response" -ENTITIES_ATTRIBUTE = "entities" +ENTITIES = "entities" +BILOU_ENTITIES = "bilou_entities" +NO_ENTITY_TAG = "O" -EXTRACTOR_ATTRIBUTE = "extractor" +EXTRACTOR = "extractor" PRETRAINED_EXTRACTORS = {"DucklingHTTPExtractor", "SpacyEntityExtractor"} CLS_TOKEN = "__CLS__" -MESSAGE_ATTRIBUTES = [TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, RESPONSE_ATTRIBUTE] +MESSAGE_ATTRIBUTES = [TEXT, INTENT, RESPONSE] -TOKENS_NAMES = { - TEXT_ATTRIBUTE: "tokens", - INTENT_ATTRIBUTE: "intent_tokens", - RESPONSE_ATTRIBUTE: "response_tokens", -} +TOKENS_NAMES = {TEXT: "tokens", INTENT: "intent_tokens", RESPONSE: "response_tokens"} SPARSE_FEATURE_NAMES = { - TEXT_ATTRIBUTE: "text_sparse_features", - INTENT_ATTRIBUTE: "intent_sparse_features", - RESPONSE_ATTRIBUTE: "response_sparse_features", + TEXT: "text_sparse_features", + INTENT: "intent_sparse_features", + RESPONSE: "response_sparse_features", } DENSE_FEATURE_NAMES = { - TEXT_ATTRIBUTE: "text_dense_features", - INTENT_ATTRIBUTE: "intent_dense_features", - RESPONSE_ATTRIBUTE: "response_dense_features", + TEXT: "text_dense_features", + INTENT: "intent_dense_features", + RESPONSE: "response_dense_features", +} + +LANGUAGE_MODEL_DOCS = { + TEXT: "text_language_model_doc", + RESPONSE: "response_language_model_doc", } -SPACY_DOCS = {TEXT_ATTRIBUTE: "spacy_doc", RESPONSE_ATTRIBUTE: "response_spacy_doc"} +TOKEN_IDS = "token_ids" +TOKENS = "tokens" +SEQUENCE_FEATURES = "sequence_features" +SENTENCE_FEATURES = "sentence_features" + +SPACY_DOCS = {TEXT: "text_spacy_doc", RESPONSE: "response_spacy_doc"} + -DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE] +DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT, RESPONSE] RESPONSE_SELECTOR_PROPERTY_NAME = "response_selector" DEFAULT_OPEN_UTTERANCE_TYPE = "default" diff --git a/rasa/nlu/extractors/__init__.py b/rasa/nlu/extractors/__init__.py index c2e01d764ec1..e69de29bb2d1 100644 --- a/rasa/nlu/extractors/__init__.py +++ b/rasa/nlu/extractors/__init__.py @@ -1,90 +0,0 @@ -from typing import Any, Dict, List, Text, Tuple - -from rasa.nlu.components import Component -from rasa.nlu.constants import EXTRACTOR_ATTRIBUTE, ENTITIES_ATTRIBUTE -from rasa.nlu.training_data import Message - - -class EntityExtractor(Component): - def add_extractor_name( - self, entities: List[Dict[Text, Any]] - ) -> List[Dict[Text, Any]]: - for entity in entities: - entity[EXTRACTOR_ATTRIBUTE] = self.name - return entities - - def add_processor_name(self, entity: Dict[Text, Any]) -> Dict[Text, Any]: - if "processors" in entity: - entity["processors"].append(self.name) - else: - entity["processors"] = [self.name] - - return entity - - @staticmethod - def 
filter_irrelevant_entities(extracted: list, requested_dimensions: set) -> list: - """Only return dimensions the user configured""" - - if requested_dimensions: - return [ - entity - for entity in extracted - if entity["entity"] in requested_dimensions - ] - else: - return extracted - - @staticmethod - def find_entity(ent, text, tokens) -> Tuple[int, int]: - offsets = [token.start for token in tokens] - ends = [token.end for token in tokens] - - if ent["start"] not in offsets: - message = ( - "Invalid entity {} in example '{}': " - "entities must span whole tokens. " - "Wrong entity start.".format(ent, text) - ) - raise ValueError(message) - - if ent["end"] not in ends: - message = ( - "Invalid entity {} in example '{}': " - "entities must span whole tokens. " - "Wrong entity end.".format(ent, text) - ) - raise ValueError(message) - - start = offsets.index(ent["start"]) - end = ends.index(ent["end"]) + 1 - return start, end - - def filter_trainable_entities( - self, entity_examples: List[Message] - ) -> List[Message]: - """Filters out untrainable entity annotations. - - Creates a copy of entity_examples in which entities that have - `extractor` set to something other than - self.name (e.g. 'CRFEntityExtractor') are removed. - """ - - filtered = [] - for message in entity_examples: - entities = [] - for ent in message.get(ENTITIES_ATTRIBUTE, []): - extractor = ent.get(EXTRACTOR_ATTRIBUTE) - if not extractor or extractor == self.name: - entities.append(ent) - data = message.data.copy() - data[ENTITIES_ATTRIBUTE] = entities - filtered.append( - Message( - text=message.text, - data=data, - output_properties=message.output_properties, - time=message.time, - ) - ) - - return filtered diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index f937dd443cd8..241e68dbf820 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -2,37 +2,35 @@ import os import typing import numpy as np -from typing import Any, Dict, List, Optional, Text, Tuple, Union, NamedTuple - -from rasa.nlu.config import InvalidConfigError, RasaNLUModelConfig -from rasa.nlu.extractors import EntityExtractor +from typing import Any, Dict, List, Optional, Text, Tuple, Union, NamedTuple, Type + +import rasa.nlu.utils.bilou_utils as bilou_utils +import rasa.utils.common as common_utils +from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY +from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu.tokenizers.tokenizer import Tokenizer +from rasa.nlu.components import Component +from rasa.nlu.extractors.extractor import EntityExtractor from rasa.nlu.model import Metadata from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( TOKENS_NAMES, - TEXT_ATTRIBUTE, + TEXT, DENSE_FEATURE_NAMES, - SPACY_DOCS, - ENTITIES_ATTRIBUTE, + ENTITIES, + NO_ENTITY_TAG, ) from rasa.constants import ( - DOCS_BASE_URL, DOCS_URL_TRAINING_DATA_NLU, DOCS_URL_COMPONENTS, + DOCS_URL_MIGRATION_GUIDE, ) -from rasa.utils.common import raise_warning - -try: - import spacy -except ImportError: - spacy = None logger = logging.getLogger(__name__) if typing.TYPE_CHECKING: from sklearn_crfsuite import CRF - from spacy.tokens import Doc class CRFToken(NamedTuple): @@ -44,27 +42,26 @@ class CRFToken(NamedTuple): class CRFEntityExtractor(EntityExtractor): - - provides = [ENTITIES_ATTRIBUTE] - - requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]] + @classmethod + def required_components(cls) 
-> List[Type[Component]]: + return [Tokenizer] defaults = { # BILOU_flag determines whether to use BILOU tagging or not. # More rigorous however requires more examples per entity # rule of thumb: use only if more than 100 egs. per entity "BILOU_flag": True, - # crf_features is [before, word, after] array with before, word, - # after holding keys about which - # features to use for each word, for example, 'title' in - # array before will have the feature - # "is the preceding word in title case?" - # POS features require spaCy to be installed + # crf_features is [before, token, after] array with before, token, + # after holding keys about which features to use for each token, + # for example, 'title' in array before will have the feature + # "is the preceding token in title case?" + # POS features require SpacyTokenizer + # pattern feature require RegexFeaturizer "features": [ ["low", "title", "upper"], [ - "bias", "low", + "bias", "prefix5", "prefix2", "suffix5", @@ -79,26 +76,28 @@ class CRFEntityExtractor(EntityExtractor): ], # The maximum number of iterations for optimization algorithms. "max_iterations": 50, - # weight of theL1 regularization + # weight of the L1 regularization "L1_c": 0.1, # weight of the L2 regularization "L2_c": 0.1, } function_dict = { - "low": lambda crf_token: crf_token.text.lower(), # pytype: disable=attribute-error - "title": lambda crf_token: crf_token.text.istitle(), # pytype: disable=attribute-error + "low": lambda crf_token: crf_token.text.lower(), + "title": lambda crf_token: crf_token.text.istitle(), "prefix5": lambda crf_token: crf_token.text[:5], "prefix2": lambda crf_token: crf_token.text[:2], "suffix5": lambda crf_token: crf_token.text[-5:], "suffix3": lambda crf_token: crf_token.text[-3:], "suffix2": lambda crf_token: crf_token.text[-2:], "suffix1": lambda crf_token: crf_token.text[-1:], - "pos": lambda crf_token: crf_token.tag, - "pos2": lambda crf_token: crf_token.tag[:2], "bias": lambda crf_token: "bias", - "upper": lambda crf_token: crf_token.text.isupper(), # pytype: disable=attribute-error - "digit": lambda crf_token: crf_token.text.isdigit(), # pytype: disable=attribute-error + "pos": lambda crf_token: crf_token.tag, + "pos2": lambda crf_token: crf_token.tag[:2] + if crf_token.tag is not None + else None, + "upper": lambda crf_token: crf_token.text.isupper(), + "digit": lambda crf_token: crf_token.text.isdigit(), "pattern": lambda crf_token: crf_token.pattern, "text_dense_features": lambda crf_token: crf_token.dense_features, } @@ -115,26 +114,11 @@ def __init__( self._validate_configuration() - self._check_pos_features_and_spacy() - - def _check_pos_features_and_spacy(self) -> None: - import itertools - - features = self.component_config.get("features", []) - fts = set(itertools.chain.from_iterable(features)) - self.pos_features = "pos" in fts or "pos2" in fts - if self.pos_features: - self._check_spacy() - - @staticmethod - def _check_spacy() -> None: - if spacy is None: - raise ImportError( - "Failed to import `spaCy`. " - "`spaCy` is required for POS features " - "See https://spacy.io/usage/ for installation" - "instructions." - ) + common_utils.raise_warning( + "'CRFEntityExtractor' is deprecated and will be removed in version " + "2.0. 
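A condensed, self-contained version of the windowed `function_dict` featurization sketched above (toy `CRFToken`, hypothetical helper names, no pattern or dense features):

```python
from typing import Any, Dict, List, NamedTuple


class CRFToken(NamedTuple):
    text: str
    tag: str  # part-of-speech tag; may be None when no POS tagger ran


FUNCTION_DICT = {
    "low": lambda t: t.text.lower(),
    "title": lambda t: t.text.istitle(),
    "upper": lambda t: t.text.isupper(),
    "digit": lambda t: t.text.isdigit(),
    "suffix2": lambda t: t.text[-2:],
    "pos": lambda t: t.tag,
}


def sentence_to_features(
    sentence: List[CRFToken], features: List[List[str]]
) -> List[Dict[str, Any]]:
    # `features` has odd length: windows for [..., previous, current, next, ...]
    half_span = len(features) // 2
    window = range(-half_span, half_span + 1)
    out = []
    for idx in range(len(sentence)):
        token_features: Dict[str, Any] = {}
        for offset, names in zip(window, features):
            position = idx + offset
            if position < 0:
                token_features["BOS"] = True
            elif position >= len(sentence):
                token_features["EOS"] = True
            else:
                for name in names:
                    token_features[f"{offset}:{name}"] = FUNCTION_DICT[name](
                        sentence[position]
                    )
        out.append(token_features)
    return out


tokens = [CRFToken("Book", "VERB"), CRFToken("Berlin", "PROPN")]
print(sentence_to_features(tokens, [["low"], ["low", "title", "pos"], ["suffix2"]]))
```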
Use 'DIETClassifier' instead.", + docs=DOCS_URL_MIGRATION_GUIDE, + ) def _validate_configuration(self) -> None: if len(self.component_config.get("features", [])) % 2 != 1: @@ -147,14 +131,15 @@ def required_packages(cls) -> List[Text]: return ["sklearn_crfsuite", "sklearn"] def train( - self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, ) -> None: # checks whether there is at least one # example with an entity annotation if training_data.entity_examples: - self._check_spacy_doc(training_data.training_examples[0]) - # filter out pre-trained entity examples filtered_entity_examples = self.filter_trainable_entities( training_data.training_examples @@ -171,39 +156,14 @@ def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]: dataset = [] for example in examples: - entity_offsets = self._convert_example(example) + entity_offsets = bilou_utils.map_message_entities(example) dataset.append(self._from_json_to_crf(example, entity_offsets)) return dataset - def _check_spacy_doc(self, message: Message) -> None: - if self.pos_features and message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) is None: - raise InvalidConfigError( - "Could not find `spacy_doc` attribute for " - "message {}\n" - "POS features require a pipeline component " - "that provides `spacy_doc` attributes, i.e. `SpacyNLP`. " - "See {}/nlu/choosing-a-pipeline/#pretrained-embeddings-spacy " - "for details".format(message.text, DOCS_BASE_URL) - ) - def process(self, message: Message, **kwargs: Any) -> None: - - self._check_spacy_doc(message) - extracted = self.add_extractor_name(self.extract_entities(message)) - message.set( - ENTITIES_ATTRIBUTE, - message.get(ENTITIES_ATTRIBUTE, []) + extracted, - add_to_output=True, - ) - - @staticmethod - def _convert_example(example: Message) -> List[Tuple[int, int, Text]]: - def convert_entity(entity): - return entity["start"], entity["end"], entity["entity"] - - return [convert_entity(ent) for ent in example.get(ENTITIES_ATTRIBUTE, [])] + message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True) def extract_entities(self, message: Message) -> List[Dict[Text, Any]]: """Take a sentence and return entities in json format""" @@ -237,29 +197,25 @@ def most_likely_entity(self, idx: int, entities: List[Any]) -> Tuple[Text, Any]: else: return "", 0.0 + @staticmethod def _create_entity_dict( - self, message: Message, - tokens: Union["Doc", List[Token]], + tokens: List[Token], start: int, end: int, entity: str, confidence: float, ) -> Dict[Text, Any]: - if isinstance(tokens, list): # tokens is a list of Token - _start = tokens[start].start - _end = tokens[end].end - value = tokens[start].text - value += "".join( - [ - message.text[tokens[i - 1].end : tokens[i].start] + tokens[i].text - for i in range(start + 1, end + 1) - ] - ) - else: # tokens is a Doc - _start = tokens[start].idx - _end = tokens[start : end + 1].end_char - value = tokens[start : end + 1].text + + _start = tokens[start].start + _end = tokens[end].end + value = tokens[start].text + value += "".join( + [ + message.text[tokens[i - 1].end : tokens[i].start] + tokens[i].text + for i in range(start + 1, end + 1) + ] + ) return { "start": _start, @@ -269,20 +225,10 @@ def _create_entity_dict( "confidence": confidence, } - @staticmethod - def _entity_from_label(label) -> Text: - return label[2:] - - @staticmethod - def _bilou_from_label(label) -> Optional[Text]: - if len(label) >= 2 and 
label[1] == "-": - return label[0].upper() - return None - @staticmethod def _tokens_without_cls(message: Message) -> List[Token]: # [:-1] to remove the CLS token from the list of tokens - return message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1] + return message.get(TOKENS_NAMES[TEXT])[:-1] def _find_bilou_end(self, word_idx, entities) -> Any: ent_word_idx = word_idx + 1 @@ -290,7 +236,7 @@ def _find_bilou_end(self, word_idx, entities) -> Any: # get information about the first word, tagged with `B-...` label, confidence = self.most_likely_entity(word_idx, entities) - entity_label = self._entity_from_label(label) + entity_label = bilou_utils.entity_name_from_tag(label) while not finished: label, label_confidence = self.most_likely_entity(ent_word_idx, entities) @@ -327,12 +273,12 @@ def _handle_bilou_label( self, word_idx: int, entities: List[Any] ) -> Tuple[Any, Any, Any]: label, confidence = self.most_likely_entity(word_idx, entities) - entity_label = self._entity_from_label(label) + entity_label = bilou_utils.entity_name_from_tag(label) - if self._bilou_from_label(label) == "U": + if bilou_utils.bilou_prefix_from_tag(label) == "U": return word_idx, confidence, entity_label - elif self._bilou_from_label(label) == "B": + elif bilou_utils.bilou_prefix_from_tag(label) == "B": # start of multi word-entity need to represent whole extent ent_word_idx, confidence = self._find_bilou_end(word_idx, entities) return ent_word_idx, confidence, entity_label @@ -344,10 +290,7 @@ def _from_crf_to_json( self, message: Message, entities: List[Any] ) -> List[Dict[Text, Any]]: - if self.pos_features: - tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) - else: - tokens = self._tokens_without_cls(message) + tokens = self._tokens_without_cls(message) if len(tokens) != len(entities): raise Exception( @@ -391,16 +334,10 @@ def _convert_simple_tagging_to_entity_result( for word_idx in range(len(tokens)): entity_label, confidence = self.most_likely_entity(word_idx, entities) word = tokens[word_idx] - if entity_label != "O": - if self.pos_features and not isinstance(word, Token): - start = word.idx - end = word.idx + len(word) - else: - start = word.start - end = word.end + if entity_label != NO_ENTITY_TAG: ent = { - "start": start, - "end": end, + "start": word.start, + "end": word.end, "value": word.text, "entity": entity_label, "confidence": confidence, @@ -457,6 +394,7 @@ def _sentence_to_features(self, sentence: List[CRFToken]) -> List[Dict[Text, Any feature_range = range(-half_span, half_span + 1) prefixes = [str(i) for i in feature_range] word_features = {} + for f_i in feature_range: if word_idx + f_i >= len(sentence): word_features["EOS"] = True @@ -478,10 +416,14 @@ def _sentence_to_features(self, sentence: List[CRFToken]) -> List[Dict[Text, Any feature_name = prefix + ":" + feature + ":" + p_name word_features[feature_name] = matched # pytype: enable=attribute-error + elif word and (feature == "pos" or feature == "pos2"): + value = self.function_dict[feature](word) + word_features[f"{prefix}:{feature}"] = value else: # append each feature to a feature vector value = self.function_dict[feature](word) word_features[prefix + ":" + feature] = value + sentence_features.append(word_features) return sentence_features @@ -505,24 +447,17 @@ def _from_json_to_crf( ) -> List[CRFToken]: """Convert json examples to format of underlying crfsuite.""" - if self.pos_features: - from spacy.gold import GoldParse # pytype: disable=import-error - - doc_or_tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) - gold = 
GoldParse(doc_or_tokens, entities=entity_offsets) - ents = [l[5] for l in gold.orig_annot] - else: - doc_or_tokens = self._tokens_without_cls(message) - ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets) + tokens = self._tokens_without_cls(message) + ents = bilou_utils.bilou_tags_from_offsets(tokens, entity_offsets) # collect badly annotated examples collected = [] - for t, e in zip(doc_or_tokens, ents): + for t, e in zip(tokens, ents): if e == "-": collected.append(t) elif collected: collected_text = " ".join([t.text for t in collected]) - raise_warning( + common_utils.raise_warning( f"Misaligned entity annotation for '{collected_text}' " f"in sentence '{message.text}' with intent " f"'{message.get('intent')}'. " @@ -536,72 +471,34 @@ def _from_json_to_crf( if not self.component_config["BILOU_flag"]: for i, label in enumerate(ents): - if self._bilou_from_label(label) in {"B", "I", "U", "L"}: + if bilou_utils.bilou_prefix_from_tag(label) in {"B", "I", "U", "L"}: # removes BILOU prefix from label - ents[i] = self._entity_from_label(label) + ents[i] = bilou_utils.entity_name_from_tag(label) return self._from_text_to_crf(message, ents) - @staticmethod - def _bilou_tags_from_offsets(tokens, entities, missing: Text = "O") -> List[Text]: - # From spacy.spacy.GoldParse, under MIT License - starts = {token.start: i for i, token in enumerate(tokens)} - ends = {token.end: i for i, token in enumerate(tokens)} - bilou = ["-" for _ in tokens] - # Handle entity cases - for start_char, end_char, label in entities: - start_token = starts.get(start_char) - end_token = ends.get(end_char) - # Only interested if the tokenization is correct - if start_token is not None and end_token is not None: - if start_token == end_token: - bilou[start_token] = "U-%s" % label - else: - bilou[start_token] = "B-%s" % label - for i in range(start_token + 1, end_token): - bilou[i] = "I-%s" % label - bilou[end_token] = "L-%s" % label - # Now distinguish the O cases from ones where we miss the tokenization - entity_chars = set() - for start_char, end_char, label in entities: - for i in range(start_char, end_char): - entity_chars.add(i) - for n, token in enumerate(tokens): - for i in range(token.start, token.end): - if i in entity_chars: - break - else: - bilou[n] = missing - - return bilou - @staticmethod def __pattern_of_token(message: Message, i: int) -> Dict: - if message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) is not None: - return message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[i].get("pattern", {}) + if message.get(TOKENS_NAMES[TEXT]) is not None: + return message.get(TOKENS_NAMES[TEXT])[i].get("pattern", {}) else: return {} - @staticmethod - def __tag_of_token(token: Any) -> Text: - if spacy.about.__version__ > "2" and token._.has("tag"): - return token._.get("tag") - else: - return token.tag_ - @staticmethod def __get_dense_features(message: Message) -> Optional[List[Any]]: - features = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) + features = message.get(DENSE_FEATURE_NAMES[TEXT]) if features is None: return None - tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], []) + tokens = message.get(TOKENS_NAMES[TEXT], []) if len(tokens) != len(features): - raise_warning( + common_utils.raise_warning( f"Number of features ({len(features)}) for attribute " - f"'{DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]}' " - f"does not match number of tokens ({len(tokens)}).", + f"'{DENSE_FEATURE_NAMES[TEXT]}' " + f"does not match number of tokens ({len(tokens)}). 
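The BILOU handling above now delegates to `rasa.nlu.utils.bilou_utils`. A simplified sketch of the offsets-to-BILOU conversion; it assumes entity boundaries line up with token boundaries and skips the misalignment markers the real code warns about:

```python
from typing import List, NamedTuple, Tuple


class Token(NamedTuple):
    text: str
    start: int
    end: int


def bilou_tags_from_offsets(
    tokens: List[Token], entities: List[Tuple[int, int, str]]
) -> List[str]:
    starts = {t.start: i for i, t in enumerate(tokens)}
    ends = {t.end: i for i, t in enumerate(tokens)}
    tags = ["O"] * len(tokens)
    for start_char, end_char, label in entities:
        first, last = starts.get(start_char), ends.get(end_char)
        if first is None or last is None:
            continue  # misaligned annotation; the real code raises a warning here
        if first == last:
            tags[first] = f"U-{label}"
        else:
            tags[first] = f"B-{label}"
            for i in range(first + 1, last):
                tags[i] = f"I-{label}"
            tags[last] = f"L-{label}"
    return tags


tokens = [Token("fly", 0, 3), Token("to", 4, 6), Token("New", 7, 10), Token("York", 11, 15)]
print(bilou_tags_from_offsets(tokens, [(7, 15, "city")]))
# ['O', 'O', 'B-city', 'L-city']
```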
Set " + f"'return_sequence' to true in the corresponding featurizer in order " + f"to make use of the features in 'CRFEntityExtractor'.", docs=DOCS_URL_COMPONENTS + "#crfentityextractor", ) return None @@ -623,17 +520,14 @@ def _from_text_to_crf( """Takes a sentence and switches it to crfsuite format.""" crf_format = [] - if self.pos_features: - tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) - else: - tokens = self._tokens_without_cls(message) + tokens = self._tokens_without_cls(message) text_dense_features = self.__get_dense_features(message) for i, token in enumerate(tokens): pattern = self.__pattern_of_token(message, i) entity = entities[i] if entities else "N/A" - tag = self.__tag_of_token(token) if self.pos_features else None + tag = token.get(POS_TAG_KEY) dense_features = ( text_dense_features[i] if text_dense_features is not None else [] ) diff --git a/rasa/nlu/extractors/duckling_http_extractor.py b/rasa/nlu/extractors/duckling_http_extractor.py index ec57abfd4eb2..a783faff7154 100644 --- a/rasa/nlu/extractors/duckling_http_extractor.py +++ b/rasa/nlu/extractors/duckling_http_extractor.py @@ -6,9 +6,9 @@ from typing import Any, List, Optional, Text, Dict from rasa.constants import DOCS_URL_COMPONENTS -from rasa.nlu.constants import ENTITIES_ATTRIBUTE +from rasa.nlu.constants import ENTITIES from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.extractors import EntityExtractor +from rasa.nlu.extractors.extractor import EntityExtractor from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message from rasa.utils.common import raise_warning @@ -53,8 +53,6 @@ def convert_duckling_format_to_rasa( class DucklingHTTPExtractor(EntityExtractor): """Searches for structured entites, e.g. dates, using a duckling server.""" - provides = [ENTITIES_ATTRIBUTE] - defaults = { # by default all dimensions recognized by duckling are returned # dimensions can be configured to contain an array of strings @@ -189,9 +187,7 @@ def process(self, message: Message, **kwargs: Any) -> None: extracted = self.add_extractor_name(extracted) message.set( - ENTITIES_ATTRIBUTE, - message.get(ENTITIES_ATTRIBUTE, []) + extracted, - add_to_output=True, + ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True, ) @classmethod diff --git a/rasa/nlu/extractors/entity_synonyms.py b/rasa/nlu/extractors/entity_synonyms.py index 00388a92b99c..5d30d7d16d95 100644 --- a/rasa/nlu/extractors/entity_synonyms.py +++ b/rasa/nlu/extractors/entity_synonyms.py @@ -2,9 +2,9 @@ from typing import Any, Dict, Optional, Text from rasa.constants import DOCS_URL_TRAINING_DATA_NLU -from rasa.nlu.constants import ENTITIES_ATTRIBUTE +from rasa.nlu.constants import ENTITIES from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.extractors import EntityExtractor +from rasa.nlu.extractors.extractor import EntityExtractor from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.utils import write_json_to_file @@ -13,9 +13,6 @@ class EntitySynonymMapper(EntityExtractor): - - provides = [ENTITIES_ATTRIBUTE] - def __init__( self, component_config: Optional[Dict[Text, Any]] = None, @@ -27,22 +24,25 @@ def __init__( self.synonyms = synonyms if synonyms else {} def train( - self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, ) -> None: for key, value in list(training_data.entity_synonyms.items()): self.add_entities_if_synonyms(key, 
value) for example in training_data.entity_examples: - for entity in example.get(ENTITIES_ATTRIBUTE, []): + for entity in example.get(ENTITIES, []): entity_val = example.text[entity["start"] : entity["end"]] self.add_entities_if_synonyms(entity_val, str(entity.get("value"))) def process(self, message: Message, **kwargs: Any) -> None: - updated_entities = message.get(ENTITIES_ATTRIBUTE, [])[:] + updated_entities = message.get(ENTITIES, [])[:] self.replace_synonyms(updated_entities) - message.set(ENTITIES_ATTRIBUTE, updated_entities, add_to_output=True) + message.set(ENTITIES, updated_entities, add_to_output=True) def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: diff --git a/rasa/nlu/extractors/extractor.py b/rasa/nlu/extractors/extractor.py new file mode 100644 index 000000000000..bcdf16ffb366 --- /dev/null +++ b/rasa/nlu/extractors/extractor.py @@ -0,0 +1,90 @@ +from typing import Any, Dict, List, Text, Tuple + +from rasa.nlu.components import Component +from rasa.nlu.constants import EXTRACTOR, ENTITIES +from rasa.nlu.training_data import Message + + +class EntityExtractor(Component): + def add_extractor_name( + self, entities: List[Dict[Text, Any]] + ) -> List[Dict[Text, Any]]: + for entity in entities: + entity[EXTRACTOR] = self.name + return entities + + def add_processor_name(self, entity: Dict[Text, Any]) -> Dict[Text, Any]: + if "processors" in entity: + entity["processors"].append(self.name) + else: + entity["processors"] = [self.name] + + return entity + + @staticmethod + def filter_irrelevant_entities(extracted: list, requested_dimensions: set) -> list: + """Only return dimensions the user configured""" + + if requested_dimensions: + return [ + entity + for entity in extracted + if entity["entity"] in requested_dimensions + ] + else: + return extracted + + @staticmethod + def find_entity(ent, text, tokens) -> Tuple[int, int]: + offsets = [token.start for token in tokens] + ends = [token.end for token in tokens] + + if ent["start"] not in offsets: + message = ( + "Invalid entity {} in example '{}': " + "entities must span whole tokens. " + "Wrong entity start.".format(ent, text) + ) + raise ValueError(message) + + if ent["end"] not in ends: + message = ( + "Invalid entity {} in example '{}': " + "entities must span whole tokens. " + "Wrong entity end.".format(ent, text) + ) + raise ValueError(message) + + start = offsets.index(ent["start"]) + end = ends.index(ent["end"]) + 1 + return start, end + + def filter_trainable_entities( + self, entity_examples: List[Message] + ) -> List[Message]: + """Filters out untrainable entity annotations. + + Creates a copy of entity_examples in which entities that have + `extractor` set to something other than + self.name (e.g. 'CRFEntityExtractor') are removed. 
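A toy illustration of the `filter_trainable_entities` idea documented above, operating on a hypothetical flat list of entity dicts instead of `Message` objects:

```python
from typing import Any, Dict, List


def filter_trainable_entities(
    entities: List[Dict[str, Any]], extractor_name: str
) -> List[Dict[str, Any]]:
    """Keep only entities this extractor produced or that have no extractor set."""
    return [
        entity
        for entity in entities
        if not entity.get("extractor") or entity.get("extractor") == extractor_name
    ]


annotated = [
    {"entity": "city", "value": "Berlin", "extractor": "DucklingHTTPExtractor"},
    {"entity": "cuisine", "value": "thai"},  # manually annotated, no extractor
]
print(filter_trainable_entities(annotated, "CRFEntityExtractor"))
# [{'entity': 'cuisine', 'value': 'thai'}]
```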
+ """ + + filtered = [] + for message in entity_examples: + entities = [] + for ent in message.get(ENTITIES, []): + extractor = ent.get(EXTRACTOR) + if not extractor or extractor == self.name: + entities.append(ent) + data = message.data.copy() + data[ENTITIES] = entities + filtered.append( + Message( + text=message.text, + data=data, + output_properties=message.output_properties, + time=message.time, + ) + ) + + return filtered diff --git a/rasa/nlu/extractors/mitie_entity_extractor.py b/rasa/nlu/extractors/mitie_entity_extractor.py index 640a78282ee8..22dd94eb9139 100644 --- a/rasa/nlu/extractors/mitie_entity_extractor.py +++ b/rasa/nlu/extractors/mitie_entity_extractor.py @@ -1,13 +1,15 @@ import logging import os import typing -from typing import Any, Dict, List, Optional, Text +from typing import Any, Dict, List, Optional, Text, Type -from rasa.nlu.constants import ENTITIES_ATTRIBUTE, TOKENS_NAMES, TEXT_ATTRIBUTE +from rasa.nlu.constants import ENTITIES, TOKENS_NAMES, TEXT from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.extractors import EntityExtractor +from rasa.nlu.utils.mitie_utils import MitieNLP +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer +from rasa.nlu.components import Component +from rasa.nlu.extractors.extractor import EntityExtractor from rasa.nlu.model import Metadata -from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData from rasa.utils.common import raise_warning @@ -18,10 +20,9 @@ class MitieEntityExtractor(EntityExtractor): - - provides = [ENTITIES_ATTRIBUTE] - - requires = [TOKENS_NAMES[TEXT_ATTRIBUTE], "mitie_feature_extractor", "mitie_file"] + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [MitieNLP, Tokenizer] def __init__(self, component_config: Optional[Dict[Text, Any]] = None, ner=None): """Construct a new intent classifier using the sklearn framework.""" @@ -36,7 +37,7 @@ def required_packages(cls) -> List[Text]: @staticmethod def _tokens_without_cls(message: Message) -> List[Token]: # [:-1] to remove the CLS token from the list of tokens - return message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[:-1] + return message.get(TOKENS_NAMES[TEXT])[:-1] def extract_entities( self, text: Text, tokens: List[Token], feature_extractor @@ -63,7 +64,10 @@ def extract_entities( return ents def train( - self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, ) -> None: import mitie @@ -100,7 +104,7 @@ def _prepare_mitie_sample(self, training_example: Message) -> Any: text = training_example.text tokens = self._tokens_without_cls(training_example) sample = mitie.ner_training_instance([t.text for t in tokens]) - for ent in training_example.get(ENTITIES_ATTRIBUTE, []): + for ent in training_example.get(ENTITIES, []): try: # if the token is not aligned an exception will be raised start, end = MitieEntityExtractor.find_entity(ent, text, tokens) @@ -139,9 +143,7 @@ def process(self, message: Message, **kwargs: Any) -> None: ) extracted = self.add_extractor_name(ents) message.set( - ENTITIES_ATTRIBUTE, - message.get(ENTITIES_ATTRIBUTE, []) + extracted, - add_to_output=True, + ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True, ) @classmethod diff --git a/rasa/nlu/extractors/spacy_entity_extractor.py b/rasa/nlu/extractors/spacy_entity_extractor.py index 8c0bf9e79322..749445acb887 100644 --- a/rasa/nlu/extractors/spacy_entity_extractor.py +++ 
b/rasa/nlu/extractors/spacy_entity_extractor.py @@ -1,8 +1,10 @@ import typing -from typing import Any, Dict, List, Text, Optional +from typing import Any, Dict, List, Text, Optional, Type -from rasa.nlu.constants import ENTITIES_ATTRIBUTE -from rasa.nlu.extractors import EntityExtractor +from rasa.nlu.constants import ENTITIES +from rasa.nlu.utils.spacy_utils import SpacyNLP +from rasa.nlu.components import Component +from rasa.nlu.extractors.extractor import EntityExtractor from rasa.nlu.training_data import Message if typing.TYPE_CHECKING: @@ -10,10 +12,9 @@ class SpacyEntityExtractor(EntityExtractor): - - provides = [ENTITIES_ATTRIBUTE] - - requires = ["spacy_nlp"] + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [SpacyNLP] defaults = { # by default all dimensions recognized by spacy are returned @@ -36,9 +37,7 @@ def process(self, message: Message, **kwargs: Any) -> None: all_extracted, dimensions ) message.set( - ENTITIES_ATTRIBUTE, - message.get(ENTITIES_ATTRIBUTE, []) + extracted, - add_to_output=True, + ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True, ) @staticmethod diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 390b0575e10a..1fa9de8d3210 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -1,13 +1,16 @@ import logging -from typing import Any, Dict, List, Optional, Text, Tuple +from typing import Any, Dict, List, Optional, Text, Tuple, Type +from tqdm import tqdm from rasa.constants import DOCS_URL_COMPONENTS from rasa.nlu.tokenizers.tokenizer import Token -from rasa.nlu.featurizers.featurizer import Featurizer +from rasa.nlu.components import Component +from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - TEXT_ATTRIBUTE, + TEXT, TOKENS_NAMES, DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, @@ -15,52 +18,40 @@ import numpy as np import tensorflow as tf -from rasa.utils.common import raise_warning +import rasa.utils.train_utils as train_utils +import rasa.utils.common as common_utils logger = logging.getLogger(__name__) -class ConveRTFeaturizer(Featurizer): +class ConveRTFeaturizer(DenseFeaturizer): + """Featurizer using ConveRT model. - provides = [ - DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES - ] + Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert) + model from TFHub and computes sentence and sequence level feature representations + for dense featurizable attributes of each message object. 
+ """ - requires = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES] - - def _load_model(self) -> None: - - # needed in order to load model - import tensorflow_text - import tensorflow_hub as tfhub - - self.graph = tf.Graph() - model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz" - - with self.graph.as_default(): - self.session = tf.Session() - self.module = tfhub.Module(model_url) - - self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None]) - self.sentence_encoding_tensor = self.module(self.text_placeholder) - self.sequence_encoding_tensor = self.module( - self.text_placeholder, signature="encode_sequence", as_dict=True - ) - self.session.run(tf.tables_initializer()) - self.session.run(tf.global_variables_initializer()) + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [ConveRTTokenizer] def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: super(ConveRTFeaturizer, self).__init__(component_config) - self._load_model() + model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz" + self.module = train_utils.load_tf_hub_model(model_url) + + self.sentence_encoding_signature = self.module.signatures["default"] + self.sequence_encoding_signature = self.module.signatures["encode_sequence"] @classmethod def required_packages(cls) -> List[Text]: return ["tensorflow_text", "tensorflow_hub"] def _compute_features( - self, batch_examples: List[Message], attribute: Text = TEXT_ATTRIBUTE + self, batch_examples: List[Message], attribute: Text = TEXT ) -> np.ndarray: sentence_encodings = self._compute_sentence_encodings(batch_examples, attribute) @@ -75,7 +66,7 @@ def _compute_features( ) def _compute_sentence_encodings( - self, batch_examples: List[Message], attribute: Text = TEXT_ATTRIBUTE + self, batch_examples: List[Message], attribute: Text = TEXT ) -> np.ndarray: # Get text for attribute of each example batch_attribute_text = [ex.get(attribute) for ex in batch_examples] @@ -85,7 +76,7 @@ def _compute_sentence_encodings( return np.reshape(sentence_encodings, (len(batch_examples), 1, -1)) def _compute_sequence_encodings( - self, batch_examples: List[Message], attribute: Text = TEXT_ATTRIBUTE + self, batch_examples: List[Message], attribute: Text = TEXT ) -> Tuple[np.ndarray, List[int]]: list_of_tokens = [ example.get(TOKENS_NAMES[attribute]) for example in batch_examples @@ -159,24 +150,26 @@ def _tokens_to_text(list_of_tokens: List[List[Token]]) -> List[Text]: return texts def _sentence_encoding_of_text(self, batch: List[Text]) -> np.ndarray: - return self.session.run( - self.sentence_encoding_tensor, feed_dict={self.text_placeholder: batch} - ) + + return self.sentence_encoding_signature(tf.convert_to_tensor(batch))[ + "default" + ].numpy() def _sequence_encoding_of_text(self, batch: List[Text]) -> np.ndarray: - return self.session.run( - self.sequence_encoding_tensor, feed_dict={self.text_placeholder: batch} - )["sequence_encoding"] + + return self.sequence_encoding_signature(tf.convert_to_tensor(batch))[ + "sequence_encoding" + ].numpy() def train( self, training_data: TrainingData, - config: Optional[RasaNLUModelConfig], + config: Optional[RasaNLUModelConfig] = None, **kwargs: Any, ) -> None: if config is not None and config.language != "en": - raise_warning( + common_utils.raise_warning( f"Since ``ConveRT`` model is trained only on an english " f"corpus of conversations, this featurizer should only be " f"used if your training data is in english language. 
" @@ -192,10 +185,11 @@ def train( filter(lambda x: x.get(attribute), training_data.training_examples) ) - batch_start_index = 0 - - while batch_start_index < len(non_empty_examples): - + progress_bar = tqdm( + range(0, len(non_empty_examples), batch_size), + desc=attribute.capitalize() + " batches", + ) + for batch_start_index in progress_bar: batch_end_index = min( batch_start_index + batch_size, len(non_empty_examples) ) @@ -206,7 +200,6 @@ def train( batch_features = self._compute_features(batch_examples, attribute) for index, ex in enumerate(batch_examples): - ex.set( DENSE_FEATURE_NAMES[attribute], self._combine_with_existing_dense_features( @@ -214,14 +207,12 @@ def train( ), ) - batch_start_index += batch_size - def process(self, message: Message, **kwargs: Any) -> None: features = self._compute_features([message])[0] message.set( - DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], + DENSE_FEATURE_NAMES[TEXT], self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE] + message, features, DENSE_FEATURE_NAMES[TEXT] ), ) diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py new file mode 100644 index 000000000000..5afaceec2fb0 --- /dev/null +++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py @@ -0,0 +1,70 @@ +import numpy as np +from typing import Any, Optional, Text, List, Type + +from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu.components import Component +from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP +from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer +from rasa.nlu.training_data import Message, TrainingData +from rasa.nlu.constants import ( + TEXT, + LANGUAGE_MODEL_DOCS, + DENSE_FEATURE_NAMES, + DENSE_FEATURIZABLE_ATTRIBUTES, + SEQUENCE_FEATURES, + SENTENCE_FEATURES, +) + + +class LanguageModelFeaturizer(DenseFeaturizer): + """Featurizer using transformer based language models. + + Uses the output of HFTransformersNLP component to set the sequence and sentence + level representations for dense featurizable attributes of each message object. + """ + + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [HFTransformersNLP, LanguageModelTokenizer] + + def train( + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, + ) -> None: + + for example in training_data.training_examples: + for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: + self._set_lm_features(example, attribute) + + def _get_doc(self, message: Message, attribute: Text) -> Any: + """ + Get the language model doc. 
A doc consists of + {'token_ids': ..., 'tokens': ..., + 'sequence_features': ..., 'sentence_features': ...} + """ + return message.get(LANGUAGE_MODEL_DOCS[attribute]) + + def process(self, message: Message, **kwargs: Any) -> None: + """Sets the dense features from the language model doc to the incoming + message.""" + self._set_lm_features(message) + + def _set_lm_features(self, message: Message, attribute: Text = TEXT) -> None: + """Adds the precomputed word vectors to the messages features.""" + doc = self._get_doc(message, attribute) + + if doc is None: + return + + sequence_features = doc[SEQUENCE_FEATURES] + sentence_features = doc[SENTENCE_FEATURES] + + features = np.concatenate([sequence_features, sentence_features]) + + features = self._combine_with_existing_dense_features( + message, features, DENSE_FEATURE_NAMES[attribute] + ) + message.set(DENSE_FEATURE_NAMES[attribute], features) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 8fd4ad68a9db..cb286d495b7f 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -1,31 +1,40 @@ import numpy as np import typing -from typing import Any, List, Text +from typing import Any, List, Text, Optional, Dict, Type from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.featurizers.featurizer import Featurizer -from rasa.nlu.tokenizers.tokenizer import Token +from rasa.nlu.components import Component +from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer +from rasa.nlu.utils.mitie_utils import MitieNLP from rasa.nlu.training_data import Message, TrainingData - -if typing.TYPE_CHECKING: - import mitie - from rasa.nlu.constants import ( - TEXT_ATTRIBUTE, + TEXT, TOKENS_NAMES, - MESSAGE_ATTRIBUTES, DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, ) +from rasa.utils.tensorflow.constants import MEAN_POOLING, POOLING + +if typing.TYPE_CHECKING: + import mitie -class MitieFeaturizer(Featurizer): +class MitieFeaturizer(DenseFeaturizer): + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [MitieNLP, Tokenizer] - provides = [DENSE_FEATURE_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + defaults = { + # Specify what pooling operation should be used to calculate the vector of + # the CLS token. 
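`LanguageModelFeaturizer` simply concatenates the token-level sequence features with the pooled sentence features provided by `HFTransformersNLP`, so the last row again plays the role of the CLS vector. Illustrative shapes (the 768 dimension is an assumption):

```python
import numpy as np

# 4 real tokens plus one pooled sentence-level vector from the language model doc
sequence_features = np.random.rand(4, 768)
sentence_features = np.random.rand(1, 768)

features = np.concatenate([sequence_features, sentence_features])
assert features.shape == (5, 768)  # last row acts as the CLS token features
```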
Available options: 'mean' and 'max' + POOLING: MEAN_POOLING + } - requires = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + [ - "mitie_feature_extractor" - ] + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + super().__init__(component_config) + + self.pooling_operation = self.component_config["pooling"] @classmethod def required_packages(cls) -> List[Text]: @@ -39,7 +48,10 @@ def get_tokens_by_attribute(self, example: Message, attribute: Text) -> Any: return example.get(TOKENS_NAMES[attribute]) def train( - self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, ) -> None: mitie_feature_extractor = self._mitie_feature_extractor(**kwargs) @@ -68,12 +80,12 @@ def process(self, message: Message, **kwargs: Any) -> None: mitie_feature_extractor = self._mitie_feature_extractor(**kwargs) features = self.features_for_tokens( - message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]), mitie_feature_extractor + message.get(TOKENS_NAMES[TEXT]), mitie_feature_extractor ) message.set( - DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], + DENSE_FEATURE_NAMES[TEXT], self._combine_with_existing_dense_features( - message, features, DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE] + message, features, DENSE_FEATURE_NAMES[TEXT] ), ) @@ -104,7 +116,7 @@ def features_for_tokens( features.append(feature_extractor.get_feature_vector(token.text)) features = np.array(features) - cls_token_vec = np.mean(features, axis=0, keepdims=True) + cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) features = np.concatenate([features, cls_token_vec]) return features diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 07ba118bd65f..dad98049427a 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -1,32 +1,40 @@ import numpy as np import typing -from typing import Any, Optional, Text +from typing import Any, Optional, Text, Dict, List, Type from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.featurizers.featurizer import Featurizer +from rasa.nlu.components import Component +from rasa.nlu.featurizers.featurizer import DenseFeaturizer +from rasa.nlu.utils.spacy_utils import SpacyNLP +from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message, TrainingData - -if typing.TYPE_CHECKING: - from spacy.tokens import Doc - from rasa.nlu.constants import ( - TEXT_ATTRIBUTE, + TEXT, SPACY_DOCS, DENSE_FEATURE_NAMES, DENSE_FEATURIZABLE_ATTRIBUTES, - TOKENS_NAMES, ) +from rasa.utils.tensorflow.constants import POOLING, MEAN_POOLING + +if typing.TYPE_CHECKING: + from spacy.tokens import Doc + +class SpacyFeaturizer(DenseFeaturizer): + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [SpacyNLP, SpacyTokenizer] -class SpacyFeaturizer(Featurizer): + defaults = { + # Specify what pooling operation should be used to calculate the vector of + # the CLS token. 
Available options: 'mean' and 'max' + POOLING: MEAN_POOLING + } - provides = [ - DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES - ] + def __init__(self, component_config: Optional[Dict[Text, Any]] = None): + super().__init__(component_config) - requires = [ - SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES - ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES] + self.pooling_operation = self.component_config[POOLING] def _features_for_doc(self, doc: "Doc") -> np.ndarray: """Feature vector for a single document / sentence / tokens.""" @@ -35,7 +43,7 @@ def _features_for_doc(self, doc: "Doc") -> np.ndarray: def train( self, training_data: TrainingData, - config: Optional[RasaNLUModelConfig], + config: Optional[RasaNLUModelConfig] = None, **kwargs: Any, ) -> None: @@ -51,7 +59,7 @@ def process(self, message: Message, **kwargs: Any) -> None: self._set_spacy_features(message) - def _set_spacy_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE): + def _set_spacy_features(self, message: Message, attribute: Text = TEXT): """Adds the spacy word vectors to the messages features.""" message_attribute_doc = self.get_doc(message, attribute) @@ -59,7 +67,7 @@ def _set_spacy_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE if message_attribute_doc is not None: features = self._features_for_doc(message_attribute_doc) - cls_token_vec = np.mean(features, axis=0, keepdims=True) + cls_token_vec = self._calculate_cls_vector(features, self.pooling_operation) features = np.concatenate([features, cls_token_vec]) features = self._combine_with_existing_dense_features( diff --git a/rasa/nlu/featurizers/featurizer.py b/rasa/nlu/featurizers/featurizer.py index 269e02f38a04..ef896a2a5ab8 100644 --- a/rasa/nlu/featurizers/featurizer.py +++ b/rasa/nlu/featurizers/featurizer.py @@ -3,7 +3,8 @@ from typing import Any, Text, Union, Optional from rasa.nlu.training_data import Message from rasa.nlu.components import Component -from rasa.nlu.constants import SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, TEXT_ATTRIBUTE +from rasa.nlu.constants import SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, TEXT +from rasa.utils.tensorflow.constants import MEAN_POOLING, MAX_POOLING def sequence_to_sentence_features( @@ -24,11 +25,15 @@ def sequence_to_sentence_features( class Featurizer(Component): + pass + + +class DenseFeaturizer(Featurizer): @staticmethod def _combine_with_existing_dense_features( message: Message, additional_features: Any, - feature_name: Text = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], + feature_name: Text = DENSE_FEATURE_NAMES[TEXT], ) -> Any: if message.get(feature_name) is not None: @@ -45,12 +50,39 @@ def _combine_with_existing_dense_features( else: return additional_features + @staticmethod + def _calculate_cls_vector( + features: np.ndarray, pooling_operation: Text + ) -> np.ndarray: + # take only non zeros feature vectors into account + non_zero_features = np.array([f for f in features if f.any()]) + + # if features are all zero just return a vector with all zeros + if non_zero_features.size == 0: + return np.zeros([1, features.shape[-1]]) + + if pooling_operation == MEAN_POOLING: + return np.mean(non_zero_features, axis=0, keepdims=True) + elif pooling_operation == MAX_POOLING: + return np.max(non_zero_features, axis=0, keepdims=True) + else: + raise ValueError( + f"Invalid pooling operation specified. 
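The new `_calculate_cls_vector` pools only the non-zero token vectors and supports mean and max pooling. A self-contained sketch with the same logic:

```python
import numpy as np

MEAN_POOLING = "mean"
MAX_POOLING = "max"


def calculate_cls_vector(features: np.ndarray, pooling_operation: str) -> np.ndarray:
    # only non-zero token vectors contribute to the pooled CLS vector
    non_zero_features = np.array([f for f in features if f.any()])
    if non_zero_features.size == 0:
        return np.zeros([1, features.shape[-1]])
    if pooling_operation == MEAN_POOLING:
        return np.mean(non_zero_features, axis=0, keepdims=True)
    if pooling_operation == MAX_POOLING:
        return np.max(non_zero_features, axis=0, keepdims=True)
    raise ValueError(f"Unknown pooling operation '{pooling_operation}'.")


token_vectors = np.array([[0.0, 0.0], [1.0, 3.0], [3.0, 1.0]])
assert np.allclose(calculate_cls_vector(token_vectors, MEAN_POOLING), [[2.0, 2.0]])
assert np.allclose(calculate_cls_vector(token_vectors, MAX_POOLING), [[3.0, 3.0]])
```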
Available operations are " + f"'{MEAN_POOLING}' or '{MAX_POOLING}', but provided value is " + f"'{pooling_operation}'." + ) + + +class SparseFeaturizer(Featurizer): @staticmethod def _combine_with_existing_sparse_features( message: Message, additional_features: Any, - feature_name: Text = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], + feature_name: Text = SPARSE_FEATURE_NAMES[TEXT], ) -> Any: + if additional_features is None: + return + if message.get(feature_name) is not None: from scipy.sparse import hstack diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 26408c8525d8..cf5d33dfd4e0 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -2,31 +2,32 @@ import os import re import scipy.sparse -from typing import Any, Dict, List, Optional, Text +from typing import Any, Dict, List, Optional, Text, Type from rasa.constants import DOCS_URL_COMPONENTS -from rasa.utils.common import raise_warning - +import rasa.utils.common as common_utils +import rasa.utils.io as io_utils from sklearn.feature_extraction.text import CountVectorizer -from rasa.nlu import utils from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.featurizers.featurizer import Featurizer +from rasa.nlu.tokenizers.tokenizer import Tokenizer +from rasa.nlu.components import Component +from rasa.nlu.featurizers.featurizer import SparseFeaturizer from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - TEXT_ATTRIBUTE, + TEXT, TOKENS_NAMES, MESSAGE_ATTRIBUTES, SPARSE_FEATURE_NAMES, - INTENT_ATTRIBUTE, + INTENT, DENSE_FEATURIZABLE_ATTRIBUTES, - RESPONSE_ATTRIBUTE, + RESPONSE, ) logger = logging.getLogger(__name__) -class CountVectorsFeaturizer(Featurizer): +class CountVectorsFeaturizer(SparseFeaturizer): """Creates a sequence of token counts features based on sklearn's `CountVectorizer`. All tokens which consist only of digits (e.g. 123 and 99 @@ -37,9 +38,9 @@ class CountVectorsFeaturizer(Featurizer): from https://arxiv.org/abs/1810.07150. """ - provides = [SPARSE_FEATURE_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - - requires = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES] + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [Tokenizer] defaults = { # whether to use a shared vocab @@ -222,12 +223,10 @@ def _get_message_tokens_by_attribute( return message.get(attribute).split() - def _process_tokens( - self, tokens: List[Text], attribute: Text = TEXT_ATTRIBUTE - ) -> List[Text]: + def _process_tokens(self, tokens: List[Text], attribute: Text = TEXT) -> List[Text]: """Apply processing and cleaning steps to text""" - if attribute == INTENT_ATTRIBUTE: + if attribute == INTENT: # Don't do any processing for intent attribute. 
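`SparseFeaturizer._combine_with_existing_sparse_features` relies on `scipy.sparse.hstack` to append new columns to whatever features are already stored on the message, for example:

```python
import scipy.sparse

existing = scipy.sparse.coo_matrix([[1, 0], [0, 1]])
additional = scipy.sparse.coo_matrix([[0, 2], [3, 0]])

# newly computed sparse features are stacked column-wise onto the existing ones
combined = scipy.sparse.hstack([existing, additional])
assert combined.shape == (2, 4)
```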
Treat them as whole labels return tokens @@ -264,7 +263,7 @@ def _replace_with_oov_token( return tokens def _get_processed_message_tokens_by_attribute( - self, message: "Message", attribute: Text = TEXT_ATTRIBUTE + self, message: Message, attribute: Text = TEXT ) -> List[Text]: """Get processed text of attribute of a message""" @@ -294,7 +293,7 @@ def _check_OOV_present(self, all_tokens: List[List[Text]]) -> None: if any(text for tokens in all_tokens for text in tokens): # if there is some text in tokens, warn if there is no oov token - raise_warning( + common_utils.raise_warning( f"The out of vocabulary token '{self.OOV_token}' was configured, but " f"could not be found in any one of the NLU message training examples. " f"All unseen words will be ignored during prediction.", @@ -302,7 +301,7 @@ def _check_OOV_present(self, all_tokens: List[List[Text]]) -> None: ) def _get_all_attributes_processed_tokens( - self, training_data: "TrainingData" + self, training_data: TrainingData ) -> Dict[Text, List[List[Text]]]: """Get processed text for all attributes of examples in training data""" @@ -327,7 +326,7 @@ def _convert_attribute_tokens_to_texts( for attribute in attribute_tokens.keys(): list_of_tokens = attribute_tokens[attribute] - if attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]: + if attribute in [RESPONSE, TEXT]: # vocabulary should not contain CLS token list_of_tokens = [tokens[:-1] for tokens in list_of_tokens] attribute_texts[attribute] = [" ".join(tokens) for tokens in list_of_tokens] @@ -357,7 +356,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): combined_cleaned_texts += attribute_texts[attribute] try: - self.vectorizers[TEXT_ATTRIBUTE].fit(combined_cleaned_texts) + self.vectorizers[TEXT].fit(combined_cleaned_texts) except ValueError: logger.warning( "Unable to train a shared CountVectorizer. 
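With a shared vocabulary, the cleaned texts of all attributes are combined into one list and fitted by a single `CountVectorizer`; the `ValueError` branch covers cases such as an empty vocabulary. A minimal sketch:

```python
from sklearn.feature_extraction.text import CountVectorizer

# cleaned texts of all attributes (user text, intent names, responses) share one vocabulary
combined_cleaned_texts = ["book a table", "restaurant_search", "sure for how many people"]

shared_vectorizer = CountVectorizer()
try:
    shared_vectorizer.fit(combined_cleaned_texts)
except ValueError:
    # raised e.g. when the texts yield an empty vocabulary
    pass
```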
" @@ -404,16 +403,21 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) def _create_sequence( self, attribute: Text, all_tokens: List[List[Text]] - ) -> List[scipy.sparse.coo_matrix]: + ) -> List[Optional[scipy.sparse.coo_matrix]]: X = [] for i, tokens in enumerate(all_tokens): + if not tokens: + # nothing to featurize + X.append(None) + continue + # vectorizer.transform returns a sparse matrix of size # [n_samples, n_features] # set input to list of tokens if sequence should be returned # otherwise join all tokens to a single string and pass that as a list tokens_without_cls = tokens - if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]: + if attribute in [TEXT, RESPONSE]: tokens_without_cls = tokens[:-1] if not tokens_without_cls: @@ -424,7 +428,7 @@ def _create_sequence( seq_vec = self.vectorizers[attribute].transform(tokens_without_cls) seq_vec.sort_indices() - if attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]: + if attribute in [TEXT, RESPONSE]: tokens_text = [" ".join(tokens_without_cls)] cls_vec = self.vectorizers[attribute].transform(tokens_text) cls_vec.sort_indices() @@ -439,7 +443,7 @@ def _create_sequence( def _get_featurized_attribute( self, attribute: Text, all_tokens: List[List[Text]] - ) -> Optional[List[scipy.sparse.coo_matrix]]: + ) -> Optional[List[Optional[scipy.sparse.coo_matrix]]]: """Return features of a particular attribute for complete data""" if self._check_attribute_vocabulary(attribute): @@ -449,7 +453,7 @@ def _get_featurized_attribute( return None def _set_attribute_features( - self, attribute: Text, attribute_features: List, training_data: "TrainingData" + self, attribute: Text, attribute_features: List, training_data: TrainingData ) -> None: """Set computed features of the attribute to corresponding message objects""" for i, example in enumerate(training_data.training_examples): @@ -514,7 +518,7 @@ def process(self, message: Message, **kwargs: Any) -> None: ) return - attribute = TEXT_ATTRIBUTE + attribute = TEXT message_tokens = self._get_processed_message_tokens_by_attribute( message, attribute ) @@ -565,18 +569,18 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]] if self.use_shared_vocab: # Only persist vocabulary from one attribute. Can be loaded and # distributed to all attributes. 
- vocab = attribute_vocabularies[TEXT_ATTRIBUTE] + vocab = attribute_vocabularies[TEXT] else: vocab = attribute_vocabularies - utils.json_pickle(featurizer_file, vocab) + io_utils.json_pickle(featurizer_file, vocab) return {"file": file_name} @classmethod def _create_shared_vocab_vectorizers( cls, parameters: Dict[Text, Any], vocabulary: Optional[Any] = None - ) -> Dict[Text, "CountVectorizer"]: + ) -> Dict[Text, CountVectorizer]: """Create vectorizers for all attributes with shared vocabulary""" shared_vectorizer = CountVectorizer( @@ -602,7 +606,7 @@ def _create_shared_vocab_vectorizers( @classmethod def _create_independent_vocab_vectorizers( cls, parameters: Dict[Text, Any], vocabulary: Optional[Any] = None - ) -> Dict[Text, "CountVectorizer"]: + ) -> Dict[Text, CountVectorizer]: """Create vectorizers for all attributes with independent vocabulary""" attribute_vectorizers = {} @@ -643,7 +647,7 @@ def load( if not os.path.exists(featurizer_file): return cls(meta) - vocabulary = utils.json_unpickle(featurizer_file) + vocabulary = io_utils.json_unpickle(featurizer_file) share_vocabulary = meta["use_shared_vocab"] diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py new file mode 100644 index 000000000000..bf0ddb39e308 --- /dev/null +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -0,0 +1,297 @@ +import logging +from collections import defaultdict, OrderedDict +from pathlib import Path + +import numpy as np +from typing import Any, Dict, Optional, Text, List, Type, Union + +from rasa.nlu.tokenizers.spacy_tokenizer import POS_TAG_KEY +from rasa.constants import DOCS_URL_COMPONENTS +from rasa.nlu.components import Component +from rasa.nlu.tokenizers.tokenizer import Token +from rasa.nlu.tokenizers.tokenizer import Tokenizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer +from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu.training_data import Message, TrainingData +from rasa.nlu.constants import TOKENS_NAMES, TEXT, SPARSE_FEATURE_NAMES +from rasa.nlu.model import Metadata +import rasa.utils.io as io_utils + +logger = logging.getLogger(__name__) + +END_OF_SENTENCE = "EOS" +BEGIN_OF_SENTENCE = "BOS" + + +class LexicalSyntacticFeaturizer(SparseFeaturizer): + """Creates features for entity extraction. + + Moves with a sliding window over every token in the user message and creates + features according to the configuration. + """ + + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [Tokenizer] + + defaults = { + # 'features' is [before, word, after] array with before, word, + # after holding keys about which features to use for each word, + # for example, 'title' in array before will have the feature + # "is the preceding word in title case?" + # POS features require 'SpacyTokenizer'. 
+ "features": [ + ["low", "title", "upper"], + ["BOS", "EOS", "low", "upper", "title", "digit"], + ["low", "title", "upper"], + ] + } + + function_dict = { + "low": lambda token: token.text.islower(), + "title": lambda token: token.text.istitle(), + "prefix5": lambda token: token.text[:5], + "prefix2": lambda token: token.text[:2], + "suffix5": lambda token: token.text[-5:], + "suffix3": lambda token: token.text[-3:], + "suffix2": lambda token: token.text[-2:], + "suffix1": lambda token: token.text[-1:], + "pos": lambda token: token.data.get(POS_TAG_KEY) + if POS_TAG_KEY in token.data + else None, + "pos2": lambda token: token.data.get(POS_TAG_KEY)[:2] + if "pos" in token.data + else None, + "upper": lambda token: token.text.isupper(), + "digit": lambda token: token.text.isdigit(), + } + + def __init__( + self, + component_config: Dict[Text, Any], + feature_to_idx_dict: Optional[Dict[Text, Any]] = None, + ): + super().__init__(component_config) + + self.feature_to_idx_dict = feature_to_idx_dict or {} + self.number_of_features = self._calculate_number_of_features() + + def _calculate_number_of_features(self) -> int: + return sum( + [ + len(feature_values.values()) + for feature_values in self.feature_to_idx_dict.values() + ] + ) + + def train( + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, + ) -> None: + self.feature_to_idx_dict = self._create_feature_to_idx_dict(training_data) + self.number_of_features = self._calculate_number_of_features() + + for example in training_data.training_examples: + self._create_sparse_features(example) + + def process(self, message: Message, **kwargs: Any) -> None: + self._create_sparse_features(message) + + def _create_feature_to_idx_dict( + self, training_data: TrainingData + ) -> Dict[Text, Dict[Text, int]]: + """Create dictionary of all feature values. + + Each feature key, defined in the component configuration, points to + different feature values and their indices in the overall resulting + feature vector. 
+ """ + + # get all possible feature values + all_features = [] + for example in training_data.training_examples: + # [:-1] to remove CLS token + tokens_without_cls = example.get(TOKENS_NAMES[TEXT])[:-1] + all_features.append(self._tokens_to_features(tokens_without_cls)) + + # build vocabulary of features + feature_vocabulary = self._build_feature_vocabulary(all_features) + + # assign a unique index to each feature value + return self._map_features_to_indices(feature_vocabulary) + + @staticmethod + def _map_features_to_indices( + feature_vocabulary: Dict[Text, List[Text]] + ) -> Dict[Text, Dict[Text, int]]: + feature_to_idx_dict = {} + offset = 0 + + for feature_name, feature_values in feature_vocabulary.items(): + feature_to_idx_dict[feature_name] = { + str(feature_value): feature_idx + for feature_idx, feature_value in enumerate( + sorted(feature_values), start=offset + ) + } + offset += len(feature_values) + + return feature_to_idx_dict + + @staticmethod + def _build_feature_vocabulary( + features: List[List[Dict[Text, Any]]] + ) -> Dict[Text, List[Text]]: + feature_vocabulary = defaultdict(set) + + for sentence_features in features: + for token_features in sentence_features: + for feature_name, feature_value in token_features.items(): + feature_vocabulary[feature_name].add(feature_value) + + # sort items to ensure same order every time (for tests) + feature_vocabulary = OrderedDict(sorted(feature_vocabulary.items())) + + return feature_vocabulary + + def _create_sparse_features(self, message: Message) -> None: + """Convert incoming messages into sparse features using the configured + features.""" + import scipy.sparse + + # [:-1] to remove CLS token + tokens = message.get(TOKENS_NAMES[TEXT])[:-1] + + sentence_features = self._tokens_to_features(tokens) + one_hot_feature_vector = self._features_to_one_hot(sentence_features) + + sparse_features = scipy.sparse.coo_matrix(one_hot_feature_vector) + + sparse_features = self._combine_with_existing_sparse_features( + message, sparse_features, feature_name=SPARSE_FEATURE_NAMES[TEXT] + ) + message.set(SPARSE_FEATURE_NAMES[TEXT], sparse_features) + + def _tokens_to_features(self, tokens: List[Token]) -> List[Dict[Text, Any]]: + """Convert words into discrete features.""" + + configured_features = self.component_config["features"] + sentence_features = [] + + for token_idx in range(len(tokens)): + # get the window size (e.g. before, word, after) of the configured features + # in case of an even number we will look at one more word before, + # e.g. 
window size 4 will result in a window range of + # [-2, -1, 0, 1] (0 = current word in sentence) + window_size = len(configured_features) + half_window_size = window_size // 2 + window_range = range(-half_window_size, half_window_size + window_size % 2) + + prefixes = [str(i) for i in window_range] + + token_features = {} + + for pointer_position in window_range: + current_idx = token_idx + pointer_position + + # skip, if current_idx is pointing to a non-existing token + if current_idx < 0 or current_idx >= len(tokens): + continue + + token = tokens[token_idx + pointer_position] + + current_feature_idx = pointer_position + half_window_size + prefix = prefixes[current_feature_idx] + + for feature in configured_features[current_feature_idx]: + token_features[f"{prefix}:{feature}"] = self._get_feature_value( + feature, token, token_idx, pointer_position, len(tokens) + ) + + sentence_features.append(token_features) + + return sentence_features + + def _features_to_one_hot( + self, sentence_features: List[Dict[Text, Any]] + ) -> np.ndarray: + """Convert the word features into a one-hot presentation using the indices + in the feature-to-idx dictionary.""" + + # +1 for CLS token + one_hot_feature_vector = np.zeros( + [len(sentence_features) + 1, self.number_of_features] + ) + + for token_idx, token_features in enumerate(sentence_features): + for feature_name, feature_value in token_features.items(): + feature_value_str = str(feature_value) + if ( + feature_name in self.feature_to_idx_dict + and feature_value_str in self.feature_to_idx_dict[feature_name] + ): + feature_idx = self.feature_to_idx_dict[feature_name][ + feature_value_str + ] + one_hot_feature_vector[token_idx][feature_idx] = 1 + + # set vector of CLS token to sum of everything + one_hot_feature_vector[-1] = np.sum(one_hot_feature_vector, axis=0) + + return one_hot_feature_vector + + def _get_feature_value( + self, + feature: Text, + token: Token, + token_idx: int, + pointer_position: int, + token_length: int, + ) -> Union[bool, int, Text]: + if feature == END_OF_SENTENCE: + return token_idx + pointer_position == token_length - 1 + + if feature == BEGIN_OF_SENTENCE: + return token_idx + pointer_position == 0 + + if feature not in self.function_dict: + raise ValueError( + f"Configured feature '{feature}' not valid. Please check " + f"'{DOCS_URL_COMPONENTS}' for valid configuration parameters." + ) + + value = self.function_dict[feature](token) + if value is None: + logger.debug( + f"Invalid value '{value}' for feature '{feature}'." + f" Feature is ignored." + ) + return value + + @classmethod + def load( + cls, + meta: Dict[Text, Any], + model_dir: Optional[Text] = None, + model_metadata: Optional[Metadata] = None, + cached_component: Optional["LexicalSyntacticFeaturizer"] = None, + **kwargs: Any, + ) -> "LexicalSyntacticFeaturizer": + + file_name = meta.get("file") + + feature_to_idx_file = Path(model_dir) / f"{file_name}.feature_to_idx_dict.pkl" + feature_to_idx_dict = io_utils.json_unpickle(feature_to_idx_file) + + return LexicalSyntacticFeaturizer(meta, feature_to_idx_dict=feature_to_idx_dict) + + def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: + """Persist this model into the passed directory. 
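`_features_to_one_hot` then turns the per-token feature dicts into a one-hot matrix and sets the CLS row to the column-wise sum of all token rows. A reduced example with an assumed four-column feature space:

```python
import numpy as np

feature_to_idx_dict = {"0:low": {"False": 0, "True": 1}, "0:title": {"False": 2, "True": 3}}
number_of_features = 4
sentence_features = [
    {"0:low": False, "0:title": True},   # e.g. token "Berlin"
    {"0:low": True, "0:title": False},   # e.g. token "calling"
]

one_hot = np.zeros([len(sentence_features) + 1, number_of_features])  # +1 row for CLS
for token_idx, token_features in enumerate(sentence_features):
    for name, value in token_features.items():
        value_str = str(value)
        if name in feature_to_idx_dict and value_str in feature_to_idx_dict[name]:
            one_hot[token_idx][feature_to_idx_dict[name][value_str]] = 1

one_hot[-1] = np.sum(one_hot, axis=0)  # CLS row aggregates all token features
```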
+ Return the metadata necessary to load the model again.""" + + feature_to_idx_file = Path(model_dir) / f"{file_name}.feature_to_idx_dict.pkl" + io_utils.json_pickle(feature_to_idx_file, self.feature_to_idx_dict) + + return {"file": file_name} diff --git a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py index 60badfb41dc1..5c80f9d803a8 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py @@ -2,12 +2,12 @@ from typing import Any, Dict, Optional, Text -from rasa.nlu.featurizers.featurizer import Featurizer +from rasa.nlu.featurizers.featurizer import SparseFeaturizer logger = logging.getLogger(__name__) -class NGramFeaturizer(Featurizer): +class NGramFeaturizer(SparseFeaturizer): def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: super(NGramFeaturizer, self).__init__(component_config) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index eaa571bfd577..c1af343d64e0 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -1,8 +1,7 @@ import logging import os import re -import typing -from typing import Any, Dict, List, Optional, Text, Union +from typing import Any, Dict, List, Optional, Text, Union, Type import numpy as np @@ -14,26 +13,25 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.constants import ( CLS_TOKEN, - RESPONSE_ATTRIBUTE, + RESPONSE, SPARSE_FEATURE_NAMES, - TEXT_ATTRIBUTE, + TEXT, TOKENS_NAMES, ) -from rasa.nlu.featurizers.featurizer import Featurizer +from rasa.nlu.tokenizers.tokenizer import Tokenizer +from rasa.nlu.components import Component +from rasa.nlu.featurizers.featurizer import SparseFeaturizer from rasa.nlu.training_data import Message, TrainingData -from rasa.utils.common import raise_warning +import rasa.utils.common as common_utils +from rasa.nlu.model import Metadata logger = logging.getLogger(__name__) -if typing.TYPE_CHECKING: - from rasa.nlu.model import Metadata - -class RegexFeaturizer(Featurizer): - - provides = [SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]] - - requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]] +class RegexFeaturizer(SparseFeaturizer): + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [Tokenizer] def __init__( self, @@ -49,18 +47,21 @@ def __init__( self._add_lookup_table_regexes(lookup_tables) def train( - self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, ) -> None: self.known_patterns = training_data.regex_features self._add_lookup_table_regexes(training_data.lookup_tables) for example in training_data.training_examples: - for attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]: + for attribute in [TEXT, RESPONSE]: self._text_features_with_regex(example, attribute) def process(self, message: Message, **kwargs: Any) -> None: - self._text_features_with_regex(message, TEXT_ATTRIBUTE) + self._text_features_with_regex(message, TEXT) def _text_features_with_regex(self, message: Message, attribute: Text) -> None: if self.known_patterns: @@ -94,6 +95,11 @@ def _features_for_patterns( return None tokens = message.get(TOKENS_NAMES[attribute], []) + + if not tokens: + # nothing to featurize + return + seq_length = len(tokens) vec = 
np.zeros([seq_length, len(self.known_patterns)]) @@ -116,7 +122,7 @@ def _features_for_patterns( if t.start < match.end() and t.end > match.start(): patterns[pattern["name"]] = True vec[token_index][pattern_index] = 1.0 - if attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]: + if attribute in [RESPONSE, TEXT]: # CLS token vector should contain all patterns vec[-1][pattern_index] = 1.0 @@ -134,7 +140,7 @@ def _generate_lookup_regex( # if it's a list, it should be the elements directly if isinstance(lookup_elements, list): elements_to_regex = lookup_elements - raise_warning( + common_utils.raise_warning( f"Directly including lookup tables as a list is deprecated since Rasa " f"1.6.", FutureWarning, @@ -170,7 +176,7 @@ def load( cls, meta: Dict[Text, Any], model_dir: Optional[Text] = None, - model_metadata: Optional["Metadata"] = None, + model_metadata: Optional[Metadata] = None, cached_component: Optional["RegexFeaturizer"] = None, **kwargs: Any, ) -> "RegexFeaturizer": diff --git a/rasa/nlu/model.py b/rasa/nlu/model.py index 6f96ce7f76bf..0c86fe9137cc 100644 --- a/rasa/nlu/model.py +++ b/rasa/nlu/model.py @@ -117,10 +117,8 @@ class Trainer: """Trainer will load the data and train all components. Requires a pipeline specification and configuration to use for - the training.""" - - # Officially supported languages (others might be used, but might fail) - SUPPORTED_LANGUAGES = ["de", "en"] + the training. + """ def __init__( self, @@ -146,11 +144,11 @@ def __init__( # build pipeline self.pipeline = self._build_pipeline(cfg, component_builder) - @staticmethod def _build_pipeline( - cfg: RasaNLUModelConfig, component_builder: ComponentBuilder + self, cfg: RasaNLUModelConfig, component_builder: ComponentBuilder ) -> List[Component]: - """Transform the passed names of the pipeline components into classes""" + """Transform the passed names of the pipeline components into classes.""" + pipeline = [] # Transform the passed names of the pipeline components into classes @@ -159,6 +157,9 @@ def _build_pipeline( component = component_builder.create_component(component_cfg, cfg) pipeline.append(component) + if not self.skip_validation: + components.validate_pipeline(pipeline) + return pipeline def train(self, data: TrainingData, **kwargs: Any) -> "Interpreter": @@ -177,7 +178,6 @@ def train(self, data: TrainingData, **kwargs: Any) -> "Interpreter": # Before the training starts: check that all arguments are provided if not self.skip_validation: - components.validate_arguments(self.pipeline, context) components.validate_required_components_from_data( self.pipeline, self.training_data ) diff --git a/rasa/nlu/persistor.py b/rasa/nlu/persistor.py index 43fdefc7178d..6d572655892d 100644 --- a/rasa/nlu/persistor.py +++ b/rasa/nlu/persistor.py @@ -113,12 +113,19 @@ class AWSPersistor(Persistor): Fetches them when needed, instead of storing them on the local disk.""" - def __init__(self, bucket_name: Text, endpoint_url: Optional[Text] = None) -> None: + def __init__( + self, + bucket_name: Text, + endpoint_url: Optional[Text] = None, + region_name: Optional[Text] = None, + ) -> None: import boto3 super().__init__() - self.s3 = boto3.resource("s3", endpoint_url=endpoint_url) - self._ensure_bucket_exists(bucket_name) + self.s3 = boto3.resource( + "s3", endpoint_url=endpoint_url, region_name=region_name + ) + self._ensure_bucket_exists(bucket_name, region_name) self.bucket_name = bucket_name self.bucket = self.s3.Bucket(bucket_name) @@ -132,11 +139,16 @@ def list_models(self) -> List[Text]: logger.warning(f"Failed to 
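`_features_for_patterns` marks a pattern column for every token whose character span overlaps a regex match and mirrors each matched pattern into the CLS row. A simplified sketch using tuple tokens instead of `Token` objects:

```python
import re
import numpy as np

text = "open a table at eight"
# assumed simplified tokens: (text, start, end); the last entry plays the role of CLS
tokens = [("open", 0, 4), ("a", 5, 6), ("table", 7, 12), ("__CLS__", 0, len(text))]
known_patterns = [{"name": "table_word", "pattern": r"\btable\b"}]

vec = np.zeros([len(tokens), len(known_patterns)])
for pattern_index, pattern in enumerate(known_patterns):
    for match in re.finditer(pattern["pattern"], text):
        for token_index, (_, start, end) in enumerate(tokens[:-1]):
            if start < match.end() and end > match.start():
                vec[token_index][pattern_index] = 1.0
        # the CLS token vector should contain every matched pattern
        vec[-1][pattern_index] = 1.0
```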
list models in AWS. {e}") return [] - def _ensure_bucket_exists(self, bucket_name: Text) -> None: + def _ensure_bucket_exists( + self, bucket_name: Text, region_name: Optional[Text] = None + ) -> None: import boto3 import botocore - bucket_config = {"LocationConstraint": boto3.DEFAULT_SESSION.region_name} + if not region_name: + region_name = boto3.DEFAULT_SESSION.region_name + + bucket_config = {"LocationConstraint": region_name} # noinspection PyUnresolvedReferences try: self.s3.create_bucket( diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 3bc286cf82bd..f00cd71e823f 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -9,32 +9,45 @@ from typing import Any, Dict, List, Optional, Text, Type from rasa.constants import DOCS_URL_COMPONENTS -from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier + +from rasa.nlu.classifiers.diet_classifier import DIETClassifier from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier from rasa.nlu.classifiers.mitie_intent_classifier import MitieIntentClassifier from rasa.nlu.classifiers.sklearn_intent_classifier import SklearnIntentClassifier +from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor from rasa.nlu.extractors.duckling_http_extractor import DucklingHTTPExtractor from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor from rasa.nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor +from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import ( + LexicalSyntacticFeaturizer, +) from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( CountVectorsFeaturizer, ) +from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer from rasa.nlu.model import Metadata -from rasa.nlu.selectors.embedding_response_selector import ResponseSelector +from rasa.nlu.selectors.response_selector import ResponseSelector from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer from rasa.nlu.utils.mitie_utils import MitieNLP from rasa.nlu.utils.spacy_utils import SpacyNLP +from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from rasa.utils.common import class_from_module_path, raise_warning +from rasa.utils.tensorflow.constants import ( + INTENT_CLASSIFICATION, + ENTITY_RECOGNITION, + NUM_TRANSFORMER_LAYERS, +) if typing.TYPE_CHECKING: from rasa.nlu.components import Component @@ -49,12 +62,14 @@ # utils SpacyNLP, MitieNLP, + HFTransformersNLP, # tokenizers MitieTokenizer, SpacyTokenizer, WhitespaceTokenizer, ConveRTTokenizer, JiebaTokenizer, + LanguageModelTokenizer, # extractors SpacyEntityExtractor, MitieEntityExtractor, @@ -65,12 +80,15 
@@ SpacyFeaturizer, MitieFeaturizer, RegexFeaturizer, + LexicalSyntacticFeaturizer, CountVectorsFeaturizer, ConveRTFeaturizer, + LanguageModelFeaturizer, # classifiers SklearnIntentClassifier, MitieIntentClassifier, KeywordIntentClassifier, + DIETClassifier, EmbeddingIntentClassifier, # selectors ResponseSelector, diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py deleted file mode 100644 index 3dcac57b5a41..000000000000 --- a/rasa/nlu/selectors/embedding_response_selector.py +++ /dev/null @@ -1,190 +0,0 @@ -import logging -import typing -from typing import Any, Dict, Text - -from rasa.nlu.components import any_of -from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier -from rasa.nlu.constants import ( - RESPONSE_ATTRIBUTE, - RESPONSE_SELECTOR_PROPERTY_NAME, - DEFAULT_OPEN_UTTERANCE_TYPE, - DENSE_FEATURE_NAMES, - TEXT_ATTRIBUTE, - SPARSE_FEATURE_NAMES, -) - -logger = logging.getLogger(__name__) - -if typing.TYPE_CHECKING: - from rasa.nlu.training_data import Message - -import tensorflow as tf - -# avoid warning println on contrib import - remove for tf 2 -tf.contrib._warning = None - - -class ResponseSelector(EmbeddingIntentClassifier): - """Response selector using supervised embeddings. - - The response selector embeds user inputs - and candidate response into the same space. - Supervised embeddings are trained by maximizing similarity between them. - It also provides rankings of the response that did not "win". - - The supervised response selector needs to be preceded by - a featurizer in the pipeline. - This featurizer creates the features used for the embeddings. - It is recommended to use ``CountVectorsFeaturizer`` that - can be optionally preceded by ``SpacyNLP`` and ``SpacyTokenizer``. - - Based on the starspace idea from: https://arxiv.org/abs/1709.03856. - However, in this implementation the `mu` parameter is treated differently - and additional hidden layers are added together with dropout. 
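For reference, a hedged sketch of how the newly registered components could be combined, written as the Python list Rasa builds from the YAML pipeline config; the `model_name` value is an assumption for illustration:

```python
pipeline = [
    {"name": "HFTransformersNLP", "model_name": "bert"},
    {"name": "LanguageModelTokenizer"},
    {"name": "LanguageModelFeaturizer"},
    {"name": "DIETClassifier"},
]
```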
- """ - - provides = [RESPONSE_ATTRIBUTE, "response_ranking"] - - requires = [ - any_of( - DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE] - ), - any_of( - DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE], - SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE], - ), - ] - - # default properties (DOC MARKER - don't remove) - defaults = { - # nn architecture - # sizes of hidden layers before the embedding layer for input words - # the number of hidden layers is thus equal to the length of this list - "hidden_layers_sizes_a": [256, 128], - # sizes of hidden layers before the embedding layer for intent labels - # the number of hidden layers is thus equal to the length of this list - "hidden_layers_sizes_b": [256, 128], - # Whether to share the hidden layer weights between input words and intent labels - "share_hidden_layers": False, - # training parameters - # initial and final batch sizes - batch size will be - # linearly increased for each epoch - "batch_size": [64, 256], - # how to create batches - "batch_strategy": "balanced", # string 'sequence' or 'balanced' - # number of epochs - "epochs": 300, - # set random seed to any int to get reproducible results - "random_seed": None, - # embedding parameters - # default dense dimension used if no dense features are present - "dense_dim": {"text": 512, "label": 20}, - # dimension size of embedding vectors - "embed_dim": 20, - # the type of the similarity - "num_neg": 20, - # flag if minimize only maximum similarity over incorrect actions - "similarity_type": "auto", # string 'auto' or 'cosine' or 'inner' - # the type of the loss function - "loss_type": "softmax", # string 'softmax' or 'margin' - # number of top responses to normalize scores for softmax loss_type - # set to 0 to turn off normalization - "ranking_length": 10, - # how similar the algorithm should try - # to make embedding vectors for correct intent labels - "mu_pos": 0.8, # should be 0.0 < ... < 1.0 for 'cosine' - # maximum negative similarity for incorrect intent labels - "mu_neg": -0.4, # should be -1.0 < ... < 1.0 for 'cosine' - # flag: if true, only minimize the maximum similarity for - # incorrect intent labels - "use_max_sim_neg": True, - # scale loss inverse proportionally to confidence of correct prediction - "scale_loss": True, - # regularization parameters - # the scale of L2 regularization - "C2": 0.002, - # the scale of how critical the algorithm should be of minimizing the - # maximum similarity between embeddings of different intent labels - "C_emb": 0.8, - # dropout rate for rnn - "droprate": 0.2, - # visualization of accuracy - # how often to calculate training accuracy - "evaluate_every_num_epochs": 20, # small values may hurt performance - # how many examples to use for calculation of training accuracy - "evaluate_on_num_examples": 0, # large values may hurt performance, - # selector config - # name of the intent for which this response selector is to be trained - "retrieval_intent": None, - } - # end default properties (DOC MARKER - don't remove) - - def _load_selector_params(self, config: Dict[Text, Any]): - self.retrieval_intent = config["retrieval_intent"] - if not self.retrieval_intent: - # retrieval intent was left to its default value - logger.info( - "Retrieval intent parameter was left to its default value. This response selector will be trained" - "on training examples combining all retrieval intents." 
- ) - - def _load_params(self) -> None: - super()._load_params() - self._load_selector_params(self.component_config) - - @staticmethod - def _set_message_property( - message: "Message", prediction_dict: Dict[Text, Any], selector_key: Text - ): - - message_selector_properties = message.get(RESPONSE_SELECTOR_PROPERTY_NAME, {}) - message_selector_properties[selector_key] = prediction_dict - message.set( - RESPONSE_SELECTOR_PROPERTY_NAME, - message_selector_properties, - add_to_output=True, - ) - - def preprocess_train_data(self, training_data): - """Performs sanity checks on training data, extracts encodings for labels - and prepares data for training""" - if self.retrieval_intent: - training_data = training_data.filter_by_intent(self.retrieval_intent) - - label_id_dict = self._create_label_id_dict( - training_data, attribute=RESPONSE_ATTRIBUTE - ) - - self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} - self._label_data = self._create_label_data( - training_data, label_id_dict, attribute=RESPONSE_ATTRIBUTE - ) - - session_data = self._create_session_data( - training_data.intent_examples, - label_id_dict, - label_attribute=RESPONSE_ATTRIBUTE, - ) - - self.check_input_dimension_consistency(session_data) - - return session_data - - def process(self, message: "Message", **kwargs: Any) -> None: - """Return the most likely response and its similarity to the input.""" - - label, label_ranking = self.predict_label(message) - - selector_key = ( - self.retrieval_intent - if self.retrieval_intent - else DEFAULT_OPEN_UTTERANCE_TYPE - ) - - logger.debug( - f"Adding following selector key to message property: {selector_key}" - ) - - prediction_dict = {"response": label, "ranking": label_ranking} - - self._set_message_property(message, prediction_dict, selector_key) diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py new file mode 100644 index 000000000000..4fc79b60889b --- /dev/null +++ b/rasa/nlu/selectors/response_selector.py @@ -0,0 +1,446 @@ +import logging + +import numpy as np +import tensorflow as tf + +from typing import Any, Dict, Optional, Text, Tuple, Union, List, Type + +from rasa.nlu.config import InvalidConfigError +from rasa.nlu.training_data import TrainingData, Message +from rasa.nlu.components import Component +from rasa.nlu.featurizers.featurizer import Featurizer +from rasa.nlu.classifiers.diet_classifier import ( + DIETClassifier, + DIET, + TEXT_FEATURES, + LABEL_FEATURES, + TEXT_MASK, + LABEL_MASK, + LABEL_IDS, +) +from rasa.utils.tensorflow.constants import ( + LABEL, + HIDDEN_LAYERS_SIZES, + SHARE_HIDDEN_LAYERS, + TRANSFORMER_SIZE, + NUM_TRANSFORMER_LAYERS, + NUM_HEADS, + BATCH_SIZES, + BATCH_STRATEGY, + EPOCHS, + RANDOM_SEED, + LEARNING_RATE, + DENSE_DIMENSION, + RANKING_LENGTH, + LOSS_TYPE, + SIMILARITY_TYPE, + NUM_NEG, + SPARSE_INPUT_DROPOUT, + MASKED_LM, + ENTITY_RECOGNITION, + INTENT_CLASSIFICATION, + EVAL_NUM_EXAMPLES, + EVAL_NUM_EPOCHS, + UNIDIRECTIONAL_ENCODER, + DROP_RATE, + DROP_RATE_ATTENTION, + WEIGHT_SPARSITY, + NEGATIVE_MARGIN_SCALE, + REGULARIZATION_CONSTANT, + SCALE_LOSS, + USE_MAX_NEG_SIM, + MAX_NEG_SIM, + MAX_POS_SIM, + EMBEDDING_DIMENSION, + BILOU_FLAG, + KEY_RELATIVE_ATTENTION, + VALUE_RELATIVE_ATTENTION, + MAX_RELATIVE_POSITION, + RETRIEVAL_INTENT, + SOFTMAX, + AUTO, + BALANCED, +) +from rasa.nlu.constants import ( + RESPONSE, + RESPONSE_SELECTOR_PROPERTY_NAME, + DEFAULT_OPEN_UTTERANCE_TYPE, + TEXT, +) +from rasa.utils.tensorflow.model_data import RasaModelData +from rasa.utils.tensorflow.models import 
RasaModel + +logger = logging.getLogger(__name__) + + +class ResponseSelector(DIETClassifier): + """Response selector using supervised embeddings. + + The response selector embeds user inputs + and candidate response into the same space. + Supervised embeddings are trained by maximizing similarity between them. + It also provides rankings of the response that did not "win". + + The supervised response selector needs to be preceded by + a featurizer in the pipeline. + This featurizer creates the features used for the embeddings. + It is recommended to use ``CountVectorsFeaturizer`` that + can be optionally preceded by ``SpacyNLP`` and ``SpacyTokenizer``. + + Based on the starspace idea from: https://arxiv.org/abs/1709.03856. + However, in this implementation the `mu` parameter is treated differently + and additional hidden layers are added together with dropout. + """ + + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [Featurizer] + + defaults = { + # ## Architecture of the used neural network + # Hidden layer sizes for layers before the embedding layers for user message + # and labels. + # The number of hidden layers is equal to the length of the corresponding + # list. + HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]}, + # Whether to share the hidden layer weights between input words and responses + SHARE_HIDDEN_LAYERS: False, + # Number of units in transformer + TRANSFORMER_SIZE: None, + # Number of transformer layers + NUM_TRANSFORMER_LAYERS: 0, + # Number of attention heads in transformer + NUM_HEADS: 4, + # If 'True' use key relative embeddings in attention + KEY_RELATIVE_ATTENTION: False, + # If 'True' use key relative embeddings in attention + VALUE_RELATIVE_ATTENTION: False, + # Max position for relative embeddings + MAX_RELATIVE_POSITION: None, + # Use a unidirectional or bidirectional encoder. + UNIDIRECTIONAL_ENCODER: False, + # ## Training parameters + # Initial and final batch sizes: + # Batch size will be linearly increased for each epoch. + BATCH_SIZES: [64, 256], + # Strategy used when creating batches. + # Can be either 'sequence' or 'balanced'. + BATCH_STRATEGY: BALANCED, + # Number of epochs to train + EPOCHS: 300, + # Set random seed to any 'int' to get reproducible results + RANDOM_SEED: None, + # Initial learning rate for the optimizer + LEARNING_RATE: 0.001, + # ## Parameters for embeddings + # Dimension size of embedding vectors + EMBEDDING_DIMENSION: 20, + # Default dense dimension to use if no dense features are present. + DENSE_DIMENSION: {TEXT: 512, LABEL: 512}, + # The number of incorrect labels. The algorithm will minimize + # their similarity to the user input during training. + NUM_NEG: 20, + # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'. + SIMILARITY_TYPE: AUTO, + # The type of the loss function, either 'softmax' or 'margin'. + LOSS_TYPE: SOFTMAX, + # Number of top actions to normalize scores for loss type 'softmax'. + # Set to 0 to turn off normalization. + RANKING_LENGTH: 10, + # Indicates how similar the algorithm should try to make embedding vectors + # for correct labels. + # Should be 0.0 < ... < 1.0 for 'cosine' similarity type. + MAX_POS_SIM: 0.8, + # Maximum negative similarity for incorrect labels. + # Should be -1.0 < ... < 1.0 for 'cosine' similarity type. + MAX_NEG_SIM: -0.4, + # If 'True' the algorithm only minimizes maximum similarity over + # incorrect intent labels, used only if 'loss_type' is set to 'margin'. 
+ USE_MAX_NEG_SIM: True, + # Scale loss inverse proportionally to confidence of correct prediction + SCALE_LOSS: True, + # ## Regularization parameters + # The scale of regularization + REGULARIZATION_CONSTANT: 0.002, + # Sparsity of the weights in dense layers + WEIGHT_SPARSITY: 0.8, + # The scale of how important is to minimize the maximum similarity + # between embeddings of different labels. + NEGATIVE_MARGIN_SCALE: 0.8, + # Dropout rate for encoder + DROP_RATE: 0.2, + # Dropout rate for attention + DROP_RATE_ATTENTION: 0, + # If 'True' apply dropout to sparse tensors + SPARSE_INPUT_DROPOUT: False, + # ## Evaluation parameters + # How often calculate validation accuracy. + # Small values may hurt performance, e.g. model accuracy. + EVAL_NUM_EPOCHS: 20, + # How many examples to use for hold out validation set + # Large values may hurt performance, e.g. model accuracy. + EVAL_NUM_EXAMPLES: 0, + # ## Selector config + # If 'True' random tokens of the input message will be masked and the model + # should predict those tokens. + MASKED_LM: False, + # Name of the intent for which this response selector is to be trained + RETRIEVAL_INTENT: None, + } + + def __init__( + self, + component_config: Optional[Dict[Text, Any]] = None, + index_label_id_mapping: Optional[Dict[int, Text]] = None, + index_tag_id_mapping: Optional[Dict[int, Text]] = None, + model: Optional[RasaModel] = None, + ) -> None: + + component_config = component_config or {} + + # the following properties cannot be adapted for the ResponseSelector + component_config[INTENT_CLASSIFICATION] = True + component_config[ENTITY_RECOGNITION] = False + component_config[BILOU_FLAG] = None + + super().__init__( + component_config, index_label_id_mapping, index_tag_id_mapping, model + ) + + @property + def label_key(self) -> Text: + return LABEL_IDS + + @staticmethod + def model_class() -> Type[RasaModel]: + return DIET2DIET + + def _load_selector_params(self, config: Dict[Text, Any]) -> None: + self.retrieval_intent = config[RETRIEVAL_INTENT] + if not self.retrieval_intent: + # retrieval intent was left to its default value + logger.info( + "Retrieval intent parameter was left to its default value. This " + "response selector will be trained on training examples combining " + "all retrieval intents." + ) + + def _check_config_parameters(self) -> None: + super()._check_config_parameters() + self._load_selector_params(self.component_config) + + @staticmethod + def _set_message_property( + message: Message, prediction_dict: Dict[Text, Any], selector_key: Text + ) -> None: + message_selector_properties = message.get(RESPONSE_SELECTOR_PROPERTY_NAME, {}) + message_selector_properties[selector_key] = prediction_dict + message.set( + RESPONSE_SELECTOR_PROPERTY_NAME, + message_selector_properties, + add_to_output=True, + ) + + def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData: + """Prepares data for training. + + Performs sanity checks on training data, extracts encodings for labels. 
+ """ + + if self.retrieval_intent: + training_data = training_data.filter_by_intent(self.retrieval_intent) + + label_id_index_mapping = self._label_id_index_mapping( + training_data, attribute=RESPONSE + ) + + if not label_id_index_mapping: + # no labels are present to train + return RasaModelData() + + self.index_label_id_mapping = self._invert_mapping(label_id_index_mapping) + + self._label_data = self._create_label_data( + training_data, label_id_index_mapping, attribute=RESPONSE + ) + + model_data = self._create_model_data( + training_data.intent_examples, + label_id_index_mapping, + label_attribute=RESPONSE, + ) + + self._check_input_dimension_consistency(model_data) + + return model_data + + def process(self, message: Message, **kwargs: Any) -> None: + """Return the most likely response and its similarity to the input.""" + + out = self._predict(message) + label, label_ranking = self._predict_label(out) + + selector_key = ( + self.retrieval_intent + if self.retrieval_intent + else DEFAULT_OPEN_UTTERANCE_TYPE + ) + + logger.debug( + f"Adding following selector key to message property: {selector_key}" + ) + + prediction_dict = {"response": label, "ranking": label_ranking} + + self._set_message_property(message, prediction_dict, selector_key) + + +class DIET2DIET(DIET): + def _check_data(self) -> None: + if TEXT_FEATURES not in self.data_signature: + raise InvalidConfigError( + f"No text features specified. " + f"Cannot train '{self.__class__.__name__}' model." + ) + if LABEL_FEATURES not in self.data_signature: + raise InvalidConfigError( + f"No label features specified. " + f"Cannot train '{self.__class__.__name__}' model." + ) + if ( + self.config[SHARE_HIDDEN_LAYERS] + and self.data_signature[TEXT_FEATURES] + != self.data_signature[LABEL_FEATURES] + ): + raise ValueError( + "If hidden layer weights are shared, data signatures " + "for text_features and label_features must coincide." 
+ ) + + def _create_metrics(self) -> None: + # self.metrics preserve order + # output losses first + self.mask_loss = tf.keras.metrics.Mean(name="m_loss") + self.response_loss = tf.keras.metrics.Mean(name="r_loss") + # output accuracies second + self.mask_acc = tf.keras.metrics.Mean(name="m_acc") + self.response_acc = tf.keras.metrics.Mean(name="r_acc") + + def _update_metrics_to_log(self) -> None: + if self.config[MASKED_LM]: + self.metrics_to_log += ["m_loss", "m_acc"] + + self.metrics_to_log += ["r_loss", "r_acc"] + + def _prepare_layers(self) -> None: + self.text_name = TEXT + self.label_name = TEXT if self.config[SHARE_HIDDEN_LAYERS] else LABEL + + self._prepare_sequence_layers(self.text_name) + self._prepare_sequence_layers(self.label_name) + if self.config[MASKED_LM]: + self._prepare_mask_lm_layers(self.text_name) + self._prepare_label_classification_layers() + + def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]: + all_label_ids = self.tf_label_data[LABEL_IDS][0] + + mask_label = self.tf_label_data[LABEL_MASK][0] + sequence_lengths_label = self._get_sequence_lengths(mask_label) + + label_transformed, _, _, _ = self._create_sequence( + self.tf_label_data[LABEL_FEATURES], mask_label, self.label_name + ) + cls_label = self._last_token(label_transformed, sequence_lengths_label) + + all_labels_embed = self._tf_layers[f"embed.{LABEL}"](cls_label) + + return all_label_ids, all_labels_embed + + def batch_loss( + self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] + ) -> tf.Tensor: + tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) + + mask_text = tf_batch_data[TEXT_MASK][0] + sequence_lengths_text = self._get_sequence_lengths(mask_text) + + ( + text_transformed, + text_in, + text_seq_ids, + lm_mask_bool_text, + ) = self._create_sequence( + tf_batch_data[TEXT_FEATURES], + mask_text, + self.text_name, + self.config[MASKED_LM], + sequence_ids=True, + ) + + mask_label = tf_batch_data[LABEL_MASK][0] + sequence_lengths_label = self._get_sequence_lengths(mask_label) + + label_transformed, _, _, _ = self._create_sequence( + tf_batch_data[LABEL_FEATURES], mask_label, self.label_name + ) + + losses = [] + + if self.config[MASKED_LM]: + loss, acc = self._mask_loss( + text_transformed, + text_in, + text_seq_ids, + lm_mask_bool_text, + self.text_name, + ) + + self.mask_loss.update_state(loss) + self.mask_acc.update_state(acc) + losses.append(loss) + + # get _cls_ vector for label classification + cls_text = self._last_token(text_transformed, sequence_lengths_text) + cls_label = self._last_token(label_transformed, sequence_lengths_label) + label_ids = tf_batch_data[LABEL_IDS][0] + + loss, acc = self._calculate_label_loss(cls_text, cls_label, label_ids) + self.response_loss.update_state(loss) + self.response_acc.update_state(acc) + losses.append(loss) + + return tf.math.add_n(losses) + + def batch_predict( + self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] + ) -> Dict[Text, tf.Tensor]: + tf_batch_data = self.batch_to_model_data_format( + batch_in, self.predict_data_signature + ) + + mask_text = tf_batch_data[TEXT_MASK][0] + sequence_lengths_text = self._get_sequence_lengths(mask_text) + + text_transformed, _, _, _ = self._create_sequence( + tf_batch_data[TEXT_FEATURES], mask_text, self.text_name + ) + + out = {} + + if self.all_labels_embed is None: + _, self.all_labels_embed = self._create_all_labels() + + # get _cls_ vector for intent classification + cls = self._last_token(text_transformed, sequence_lengths_text) + cls_embed = 
self._tf_layers[f"embed.{TEXT}"](cls) + + sim_all = self._tf_layers[f"loss.{LABEL}"].sim( + cls_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :] + ) + scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( + sim_all, self.config[SIMILARITY_TYPE] + ) + out["i_scores"] = scores + + return out diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index cc36edd08d0f..4a449f9e9b9c 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -25,8 +25,9 @@ DEFAULT_OPEN_UTTERANCE_TYPE, RESPONSE_SELECTOR_PROPERTY_NAME, OPEN_UTTERANCE_PREDICTION_KEY, - EXTRACTOR_ATTRIBUTE, + EXTRACTOR, PRETRAINED_EXTRACTORS, + NO_ENTITY_TAG, ) from rasa.model import get_model from rasa.nlu import config, training_data, utils @@ -39,7 +40,13 @@ logger = logging.getLogger(__name__) -ENTITY_PROCESSORS = {"EntitySynonymMapper"} +# Exclude 'EmbeddingIntentClassifier' and 'ResponseSelector' as their super class +# performs entity extraction but those two classifiers don't +ENTITY_PROCESSORS = { + "EntitySynonymMapper", + "EmbeddingIntentClassifier", + "ResponseSelector", +} CVEvaluationResult = namedtuple("Results", "train test") @@ -682,13 +689,15 @@ def evaluate_entities( aligned_predictions = align_all_entity_predictions(entity_results, extractors) merged_targets = merge_labels(aligned_predictions) - merged_targets = substitute_labels(merged_targets, "O", NO_ENTITY) + merged_targets = substitute_labels(merged_targets, NO_ENTITY_TAG, NO_ENTITY) result = {} for extractor in extractors: merged_predictions = merge_labels(aligned_predictions, extractor) - merged_predictions = substitute_labels(merged_predictions, "O", NO_ENTITY) + merged_predictions = substitute_labels( + merged_predictions, NO_ENTITY_TAG, NO_ENTITY + ) logger.info(f"Evaluation for entity extractor: {extractor} ") if output_directory: report_filename = f"{extractor}_report.json" @@ -815,7 +824,7 @@ def pick_best_entity_fit(token: Token, candidates: List[Dict]) -> Text: """ if len(candidates) == 0: - return "O" + return NO_ENTITY_TAG elif len(candidates) == 1: return candidates[0]["entity"] else: @@ -836,7 +845,7 @@ def determine_token_labels( """ if entities is None or len(entities) == 0: - return "O" + return NO_ENTITY_TAG if not do_extractors_support_overlap(extractors) and do_entities_overlap(entities): raise ValueError("The possible entities should not overlap") @@ -872,7 +881,7 @@ def align_entity_predictions( extractor: [] for extractor in extractors } for p in result.entity_predictions: - entities_by_extractors[p[EXTRACTOR_ATTRIBUTE]].append(p) + entities_by_extractors[p[EXTRACTOR]].append(p) extractor_labels: Dict[Text, List] = {extractor: [] for extractor in extractors} for t in result.tokens: true_token_labels.append(determine_token_labels(t, result.entity_targets, None)) @@ -994,43 +1003,56 @@ def get_eval_data( def get_entity_extractors(interpreter: Interpreter) -> Set[Text]: """Finds the names of entity extractors used by the interpreter. - Processors are removed since they do not - detect the boundaries themselves.""" - extractors = {c.name for c in interpreter.pipeline if "entities" in c.provides} + Processors are removed since they do not detect the boundaries themselves. 
+ """ + + from rasa.nlu.extractors.extractor import EntityExtractor + + extractors = { + c.name for c in interpreter.pipeline if isinstance(c, EntityExtractor) + } return extractors - ENTITY_PROCESSORS def is_entity_extractor_present(interpreter: Interpreter) -> bool: - """Checks whether entity extractor is present""" + """Checks whether entity extractor is present.""" extractors = get_entity_extractors(interpreter) return extractors != [] def is_intent_classifier_present(interpreter: Interpreter) -> bool: - """Checks whether intent classifier is present""" + """Checks whether intent classifier is present.""" + + from rasa.nlu.classifiers.classifier import IntentClassifier intent_classifiers = [ - c.name for c in interpreter.pipeline if "intent" in c.provides + c.name for c in interpreter.pipeline if isinstance(c, IntentClassifier) ] return intent_classifiers != [] def is_response_selector_present(interpreter: Interpreter) -> bool: - """Checks whether response selector is present""" + """Checks whether response selector is present.""" + + from rasa.nlu.selectors.response_selector import ResponseSelector response_selectors = [ - c.name for c in interpreter.pipeline if "response" in c.provides + c.name for c in interpreter.pipeline if isinstance(c, ResponseSelector) ] return response_selectors != [] def get_available_response_selector_types(interpreter: Interpreter) -> List[Text]: - """Gets all available response selector types""" + """Gets all available response selector types.""" + + from rasa.nlu.selectors.response_selector import ResponseSelector response_selector_types = [ - c.retrieval_intent for c in interpreter.pipeline if "response" in c.provides + c.retrieval_intent + for c in interpreter.pipeline + if isinstance(c, ResponseSelector) ] return response_selector_types @@ -1433,8 +1455,6 @@ def compare_nlu( train, test = data.train_test_split() write_to_file(test_path, test.nlu_as_markdown()) - training_examples_per_run = [] - for percentage in exclusion_percentages: percent_string = f"{percentage}%_exclusion" @@ -1514,11 +1534,13 @@ def _compute_entity_metrics( aligned_predictions = align_all_entity_predictions(entity_results, extractors) merged_targets = merge_labels(aligned_predictions) - merged_targets = substitute_labels(merged_targets, "O", NO_ENTITY) + merged_targets = substitute_labels(merged_targets, NO_ENTITY_TAG, NO_ENTITY) for extractor in extractors: merged_predictions = merge_labels(aligned_predictions, extractor) - merged_predictions = substitute_labels(merged_predictions, "O", NO_ENTITY) + merged_predictions = substitute_labels( + merged_predictions, NO_ENTITY_TAG, NO_ENTITY + ) _, precision, f1, accuracy = get_evaluation_metrics( merged_targets, merged_predictions, exclude_label=NO_ENTITY ) diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py index 8b61624c1c6a..5727a258641f 100644 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ b/rasa/nlu/tokenizers/convert_tokenizer.py @@ -4,12 +4,17 @@ from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.training_data import Message from rasa.nlu.constants import MESSAGE_ATTRIBUTES, TOKENS_NAMES +import rasa.utils.train_utils as train_utils import tensorflow as tf class ConveRTTokenizer(WhitespaceTokenizer): + """Tokenizer using ConveRT model. 
- provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + Loads the ConveRT(https://github.com/PolyAI-LDN/polyai-models#convert) + model from TFHub and computes sub-word tokens for dense + featurizable attributes of each message object. + """ defaults = { # Flag to check whether to split intents @@ -25,31 +30,16 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: super().__init__(component_config) - self._load_tokenizer_params() - - def _load_tokenizer_params(self): - - # needed to load the ConveRT model - import tensorflow_text - import tensorflow_hub as tfhub - - self.graph = tf.Graph() model_url = "http://models.poly-ai.com/convert/v1/model.tar.gz" + self.module = train_utils.load_tf_hub_model(model_url) - with self.graph.as_default(): - self.session = tf.Session() - self.module = tfhub.Module(model_url) - - self.text_placeholder = tf.placeholder(dtype=tf.string, shape=[None]) - self.tokenized = self.module(self.text_placeholder, signature="tokenize") - - self.session.run(tf.tables_initializer()) - self.session.run(tf.global_variables_initializer()) + self.tokenize_signature = self.module.signatures["tokenize"] def _tokenize(self, sentence: Text) -> Any: - return self.session.run( - self.tokenized, feed_dict={self.text_placeholder: [sentence]} - ) + + return self.tokenize_signature(tf.convert_to_tensor([sentence]))[ + "default" + ].numpy() def tokenize(self, message: Message, attribute: Text) -> List[Token]: """Tokenize the text using the ConveRT model. @@ -75,10 +65,9 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]: # clean tokens (remove special chars and empty tokens) split_token_strings = self._clean_tokens(split_token_strings) - _aligned_tokens = self._align_tokens( + tokens_out += train_utils.align_tokens( split_token_strings, token_end, token_start ) - tokens_out += _aligned_tokens return tokens_out @@ -87,38 +76,3 @@ def _clean_tokens(self, tokens: List[bytes]): tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens] return [string for string in tokens if string] - - def _align_tokens(self, tokens_in: List[Text], token_end: int, token_start: int): - """Align sub-tokens of ConveRT with tokens return by the WhitespaceTokenizer. - - As ConveRT might split a single word into multiple tokens, we need to make - sure that the start and end value of first and last sub-token matches the - start and end value of the token return by the WhitespaceTokenizer as the - entities are using those start and end values. 
- """ - - tokens_out = [] - - current_token_offset = token_start - - for index, string in enumerate(tokens_in): - if index == 0: - if index == len(tokens_in) - 1: - s_token_end = token_end - else: - s_token_end = current_token_offset + len(string) - tokens_out.append(Token(string, token_start, end=s_token_end)) - elif index == len(tokens_in) - 1: - tokens_out.append(Token(string, current_token_offset, end=token_end)) - else: - tokens_out.append( - Token( - string, - current_token_offset, - end=current_token_offset + len(string), - ) - ) - - current_token_offset += len(string) - - return tokens_out diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 6cf2af11f45e..59dd9425a404 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -20,8 +20,6 @@ class JiebaTokenizer(Tokenizer): - provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - language_list = ["zh"] defaults = { diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py new file mode 100644 index 000000000000..56ac683ddf60 --- /dev/null +++ b/rasa/nlu/tokenizers/lm_tokenizer.py @@ -0,0 +1,38 @@ +from typing import Text, List, Any, Dict, Type + +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer +from rasa.nlu.components import Component +from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP +from rasa.nlu.training_data import Message + +from rasa.nlu.constants import ( + LANGUAGE_MODEL_DOCS, + TOKENS, +) + + +class LanguageModelTokenizer(Tokenizer): + """Tokenizer using transformer based language models. + + Uses the output of HFTransformersNLP component to set the tokens + for dense featurizable attributes of each message object. 
+ """ + + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [HFTransformersNLP] + + defaults = { + # Flag to check whether to split intents + "intent_tokenization_flag": False, + # Symbol on which intent should be split + "intent_split_symbol": "_", + } + + def get_doc(self, message: Message, attribute: Text) -> Dict[Text, Any]: + return message.get(LANGUAGE_MODEL_DOCS[attribute]) + + def tokenize(self, message: Message, attribute: Text) -> List[Token]: + doc = self.get_doc(message, attribute) + + return doc[TOKENS] diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 1a9e4d34c980..054e3225fb10 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -9,8 +9,6 @@ class MitieTokenizer(Tokenizer): - provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - defaults = { # Flag to check whether to split intents "intent_tokenization_flag": False, diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 9a66cb6522e9..58368b48aaf7 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -1,20 +1,24 @@ import typing -from typing import Text, List +from typing import Text, List, Any, Type from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer +from rasa.nlu.components import Component +from rasa.nlu.utils.spacy_utils import SpacyNLP from rasa.nlu.training_data import Message -from rasa.nlu.constants import TOKENS_NAMES, SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES +from rasa.nlu.constants import SPACY_DOCS if typing.TYPE_CHECKING: from spacy.tokens.doc import Doc # pytype: disable=import-error -class SpacyTokenizer(Tokenizer): +POS_TAG_KEY = "pos" - provides = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES] - requires = [SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES] +class SpacyTokenizer(Tokenizer): + @classmethod + def required_components(cls) -> List[Type[Component]]: + return [SpacyNLP] defaults = { # Flag to check whether to split intents @@ -29,4 +33,18 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc": def tokenize(self, message: Message, attribute: Text) -> List[Token]: doc = self.get_doc(message, attribute) - return [Token(t.text, t.idx, lemma=t.lemma_) for t in doc] + return [ + Token( + t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)} + ) + for t in doc + ] + + @staticmethod + def _tag_of_token(token: Any) -> Text: + import spacy + + if spacy.about.__version__ > "2" and token._.has("tag"): + return token._.get("tag") + else: + return token.tag_ diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 79016f66e59c..26f96d459395 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -6,12 +6,12 @@ from rasa.nlu.training_data import TrainingData, Message from rasa.nlu.components import Component from rasa.nlu.constants import ( - RESPONSE_ATTRIBUTE, - TEXT_ATTRIBUTE, + RESPONSE, + TEXT, CLS_TOKEN, TOKENS_NAMES, MESSAGE_ATTRIBUTES, - INTENT_ATTRIBUTE, + INTENT, ) logger = logging.getLogger(__name__) @@ -22,16 +22,16 @@ def __init__( self, text: Text, start: int, + end: Optional[int] = None, data: Optional[Dict[Text, Any]] = None, lemma: Optional[Text] = None, - end: Optional[int] = None, ) -> None: - self.start = start self.text = text - self.end = start + len(text) + self.start = start + self.end = end if end else start + 
len(text) + self.data = data if data else {} self.lemma = lemma or text - self.end = end if end else start + len(text) def set(self, prop: Text, info: Any) -> None: self.data[prop] = info @@ -89,7 +89,7 @@ def train( for example in training_data.training_examples: for attribute in MESSAGE_ATTRIBUTES: if example.get(attribute) is not None: - if attribute == INTENT_ATTRIBUTE: + if attribute == INTENT: tokens = self._split_intent(example) else: tokens = self.tokenize(example, attribute) @@ -99,12 +99,12 @@ def train( def process(self, message: Message, **kwargs: Any) -> None: """Tokenize the incoming message.""" - tokens = self.tokenize(message, TEXT_ATTRIBUTE) - tokens = self.add_cls_token(tokens, TEXT_ATTRIBUTE) - message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens) + tokens = self.tokenize(message, TEXT) + tokens = self.add_cls_token(tokens, TEXT) + message.set(TOKENS_NAMES[TEXT], tokens) def _split_intent(self, message: Message): - text = message.get(INTENT_ATTRIBUTE) + text = message.get(INTENT) words = ( text.split(self.intent_split_symbol) @@ -129,7 +129,7 @@ def _convert_words_to_tokens(words: List[Text], text: Text) -> List[Token]: @staticmethod def add_cls_token(tokens: List[Token], attribute: Text) -> List[Token]: - if attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE] and tokens: + if attribute in [RESPONSE, TEXT] and tokens: # +1 to have a space between the last token and the __cls__ token idx = tokens[-1].end + 1 tokens.append(Token(CLS_TOKEN, idx)) diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 4b7c7253d2bd..85ad4d07bf0d 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -8,8 +8,6 @@ class WhitespaceTokenizer(Tokenizer): - provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - defaults = { # Flag to check whether to split intents "intent_tokenization_flag": False, diff --git a/rasa/nlu/training_data/formats/markdown.py b/rasa/nlu/training_data/formats/markdown.py index 3b27facce2cb..95c630665421 100644 --- a/rasa/nlu/training_data/formats/markdown.py +++ b/rasa/nlu/training_data/formats/markdown.py @@ -11,7 +11,7 @@ TrainingDataWriter, ) from rasa.nlu.utils import build_entity -from rasa.nlu.constants import INTENT_ATTRIBUTE +from rasa.nlu.constants import INTENT if typing.TYPE_CHECKING: @@ -218,7 +218,7 @@ def _generate_training_examples_md(self, training_data: "TrainingData") -> Text: # Sort by intent while keeping basic intent order for example in [e.as_dict_nlu() for e in training_data.training_examples]: rasa_nlu_training_data_utils.remove_untrainable_entities_from(example) - intent = example[INTENT_ATTRIBUTE] + intent = example[INTENT] training_examples.setdefault(intent, []) training_examples[intent].append(example) diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index 7a661a748923..c161c1fa01ff 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -1,11 +1,11 @@ from typing import Any, Optional, Tuple, Text from rasa.nlu.constants import ( - ENTITIES_ATTRIBUTE, - INTENT_ATTRIBUTE, - RESPONSE_ATTRIBUTE, + ENTITIES, + INTENT, + RESPONSE, RESPONSE_KEY_ATTRIBUTE, - TEXT_ATTRIBUTE, + TEXT, RESPONSE_IDENTIFIER_DELIMITER, ) from rasa.nlu.utils import ordered @@ -30,7 +30,7 @@ def set(self, prop, info, add_to_output=False) -> None: self.output_properties.add(prop) def get(self, prop, default=None) -> Any: - if prop == TEXT_ATTRIBUTE: + if prop == TEXT: return self.text 
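        # TEXT is special-cased here: the raw message text lives on the Message
        # object itself, while every other attribute (intent, entities, tokens, ...)
        # is looked up in the data dict.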
return self.data.get(prop, default) @@ -38,10 +38,10 @@ def as_dict_nlu(self) -> dict: """Get dict representation of message as it would appear in training data""" d = self.as_dict() - if d.get(INTENT_ATTRIBUTE, None): - d[INTENT_ATTRIBUTE] = self.get_combined_intent_response_key() + if d.get(INTENT, None): + d[INTENT] = self.get_combined_intent_response_key() d.pop(RESPONSE_KEY_ATTRIBUTE, None) - d.pop(RESPONSE_ATTRIBUTE, None) + d.pop(RESPONSE, None) return d def as_dict(self, only_output_properties=False) -> dict: @@ -73,17 +73,17 @@ def build(cls, text, intent=None, entities=None) -> "Message": data = {} if intent: split_intent, response_key = cls.separate_intent_response_key(intent) - data[INTENT_ATTRIBUTE] = split_intent + data[INTENT] = split_intent if response_key: data[RESPONSE_KEY_ATTRIBUTE] = response_key if entities: - data[ENTITIES_ATTRIBUTE] = entities + data[ENTITIES] = entities return cls(text, data) def get_combined_intent_response_key(self) -> Text: """Get intent as it appears in training data""" - intent = self.get(INTENT_ATTRIBUTE) + intent = self.get(INTENT) response_key = self.get(RESPONSE_KEY_ATTRIBUTE) response_key_suffix = ( f"{RESPONSE_IDENTIFIER_DELIMITER}{response_key}" if response_key else "" diff --git a/rasa/nlu/training_data/training_data.py b/rasa/nlu/training_data/training_data.py index e542734ea1f0..e3e97d126820 100644 --- a/rasa/nlu/training_data/training_data.py +++ b/rasa/nlu/training_data/training_data.py @@ -8,7 +8,7 @@ import rasa.nlu.utils from rasa.utils.common import raise_warning, lazy_property -from rasa.nlu.constants import RESPONSE_ATTRIBUTE, RESPONSE_KEY_ATTRIBUTE +from rasa.nlu.constants import RESPONSE, RESPONSE_KEY_ATTRIBUTE from rasa.nlu.training_data.message import Message from rasa.nlu.training_data.util import check_duplicate_synonym from rasa.nlu.utils import list_to_str @@ -186,7 +186,7 @@ def fill_response_phrases(self) -> None: assistant_utterances = self.nlg_stories.get(story_lookup_intent, []) if assistant_utterances: # selecting only first assistant utterance for now - example.set(RESPONSE_ATTRIBUTE, assistant_utterances[0]) + example.set(RESPONSE, assistant_utterances[0]) else: raise ValueError( "No response phrases found for {}. Check training data " @@ -384,10 +384,8 @@ def build_nlg_stories_from_examples(examples) -> Dict[Text, list]: nlg_stories = {} for ex in examples: - if ex.get(RESPONSE_KEY_ATTRIBUTE) and ex.get(RESPONSE_ATTRIBUTE): - nlg_stories[ex.get_combined_intent_response_key()] = [ - ex.get(RESPONSE_ATTRIBUTE) - ] + if ex.get(RESPONSE_KEY_ATTRIBUTE) and ex.get(RESPONSE): + nlg_stories[ex.get_combined_intent_response_key()] = [ex.get(RESPONSE)] return nlg_stories def split_nlu_examples( diff --git a/rasa/nlu/training_data/util.py b/rasa/nlu/training_data/util.py index 1294434518c7..bb189a3bb4a2 100644 --- a/rasa/nlu/training_data/util.py +++ b/rasa/nlu/training_data/util.py @@ -5,8 +5,8 @@ import rasa.utils.io as io_utils from rasa.nlu.constants import ( - ENTITIES_ATTRIBUTE, - EXTRACTOR_ATTRIBUTE, + ENTITIES, + EXTRACTOR, PRETRAINED_EXTRACTORS, ) from rasa.utils.common import raise_warning @@ -67,7 +67,7 @@ def remove_untrainable_entities_from(example: Dict[Text, Any]) -> None: example: Serialised training example to inspect. 
""" - example_entities = example.get(ENTITIES_ATTRIBUTE) + example_entities = example.get(ENTITIES) if not example_entities: # example contains no entities, so there's nothing to do @@ -76,7 +76,7 @@ def remove_untrainable_entities_from(example: Dict[Text, Any]) -> None: trainable_entities = [] for entity in example_entities: - if entity.get(EXTRACTOR_ATTRIBUTE) in PRETRAINED_EXTRACTORS: + if entity.get(EXTRACTOR) in PRETRAINED_EXTRACTORS: logger.debug( f"Excluding entity '{json.dumps(entity)}' from training data. " f"Entity examples extracted by the following classes are not " @@ -86,4 +86,4 @@ def remove_untrainable_entities_from(example: Dict[Text, Any]) -> None: else: trainable_entities.append(entity) - example[ENTITIES_ATTRIBUTE] = trainable_entities + example[ENTITIES] = trainable_entities diff --git a/rasa/nlu/utils/__init__.py b/rasa/nlu/utils/__init__.py index 21dbba149f48..528f990cc09a 100644 --- a/rasa/nlu/utils/__init__.py +++ b/rasa/nlu/utils/__init__.py @@ -104,24 +104,3 @@ def remove_model(model_dir: Text) -> bool: "Cannot remove {}, it seems it is not a model " "directory".format(model_dir) ) - - -def json_unpickle(file_name: Text) -> Any: - """Unpickle an object from file using json.""" - import jsonpickle.ext.numpy as jsonpickle_numpy - import jsonpickle - - jsonpickle_numpy.register_handlers() - - file_content = io_utils.read_file(file_name) - return jsonpickle.loads(file_content) - - -def json_pickle(file_name: Text, obj: Any) -> None: - """Pickle an object to a file using json.""" - import jsonpickle.ext.numpy as jsonpickle_numpy - import jsonpickle - - jsonpickle_numpy.register_handlers() - - io_utils.write_text_file(jsonpickle.dumps(obj), file_name) diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py new file mode 100644 index 000000000000..335c3fe0ece3 --- /dev/null +++ b/rasa/nlu/utils/bilou_utils.py @@ -0,0 +1,208 @@ +from typing import List, Tuple, Text, Optional, Dict, Set, Any + +from rasa.nlu.tokenizers.tokenizer import Token +from rasa.nlu.training_data import Message +from rasa.nlu.training_data import TrainingData +from rasa.nlu.constants import ( + ENTITIES, + TOKENS_NAMES, + TEXT, + BILOU_ENTITIES, + NO_ENTITY_TAG, +) + +BILOU_PREFIXES = ["B-", "I-", "U-", "L-"] + + +def bilou_prefix_from_tag(tag: Text) -> Optional[Text]: + """Returns the BILOU prefix from the given tag. + + Args: + tag: the tag + + Returns: the BILOU prefix of the tag + """ + if tag[:2] in BILOU_PREFIXES: + return tag[0] + return None + + +def entity_name_from_tag(tag: Text) -> Text: + """Remove the BILOU prefix from the given tag. + + Args: + tag: the tag + + Returns: the tag without the BILOU prefix + """ + if tag[:2] in BILOU_PREFIXES: + return tag[2:] + return tag + + +def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]: + """Maps the entity tags of the message to the ids of the provided dict. + + Args: + message: the message + tag_id_dict: mapping of tags to ids + + Returns: a list of tag ids + """ + if message.get(BILOU_ENTITIES): + _tags = [ + tag_id_dict[_tag] if _tag in tag_id_dict else tag_id_dict[NO_ENTITY_TAG] + for _tag in message.get(BILOU_ENTITIES) + ] + else: + _tags = [tag_id_dict[NO_ENTITY_TAG] for _ in message.get(TOKENS_NAMES[TEXT])] + + return _tags + + +def remove_bilou_prefixes(tags: List[Text]) -> List[Text]: + """Removes the BILOU prefixes from the given list of tags. 
+ + Args: + tags: the list of tags + + Returns: list of tags without BILOU prefix + """ + return [entity_name_from_tag(t) for t in tags] + + +def build_tag_id_dict(training_data: TrainingData) -> Dict[Text, int]: + """Create a mapping of unique tags to ids. + + Args: + training_data: the training data + + Returns: a mapping of tags to ids + """ + distinct_tags = set( + [ + entity_name_from_tag(e) + for example in training_data.training_examples + if example.get(BILOU_ENTITIES) + for e in example.get(BILOU_ENTITIES) + ] + ) - {NO_ENTITY_TAG} + + tag_id_dict = { + f"{prefix}{tag}": idx_1 * len(BILOU_PREFIXES) + idx_2 + 1 + for idx_1, tag in enumerate(sorted(distinct_tags)) + for idx_2, prefix in enumerate(BILOU_PREFIXES) + } + # NO_ENTITY_TAG corresponds to non-entity which should correspond to 0 index + # needed for correct prediction for padding + tag_id_dict[NO_ENTITY_TAG] = 0 + + return tag_id_dict + + +def apply_bilou_schema(training_data: TrainingData) -> None: + """Gets a list of BILOU entity tags and sets them on the given messages. + + Args: + training_data: the training data + """ + for message in training_data.training_examples: + entities = message.get(ENTITIES) + + if not entities: + continue + + entities = map_message_entities(message) + output = bilou_tags_from_offsets(message.get(TOKENS_NAMES[TEXT]), entities) + + message.set(BILOU_ENTITIES, output) + + +def map_message_entities(message: Message) -> List[Tuple[int, int, Text]]: + """Maps the entities of the given message to their start, end, and tag values. + + Args: + message: the message + + Returns: a list of start, end, and tag value tuples + """ + + def convert_entity(entity: Dict[Text, Any]) -> Tuple[int, int, Text]: + return entity["start"], entity["end"], entity["entity"] + + return [convert_entity(entity) for entity in message.get(ENTITIES, [])] + + +def bilou_tags_from_offsets( + tokens: List[Token], + entities: List[Tuple[int, int, Text]], + missing: Text = NO_ENTITY_TAG, +) -> List[Text]: + """Creates a list of BILOU tags for the given list of tokens and entities. 
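
    For example, four whitespace tokens spanning "San Francisco is nice" combined
    with the entity offsets (0, 13, "city") produce ["B-city", "L-city", "O", "O"].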
+ + Args: + tokens: the list of tokens + entities: the list of start, end, and tag tuples + missing: tag for missing entities + + Returns: a list of BILOU tags + """ + # From spacy.spacy.GoldParse, under MIT License + + start_pos_to_token_idx = {token.start: i for i, token in enumerate(tokens)} + end_pos_to_token_idx = {token.end: i for i, token in enumerate(tokens)} + + bilou = ["-" for _ in tokens] + + # Handle entity cases + _add_bilou_tags_to_entities( + bilou, entities, end_pos_to_token_idx, start_pos_to_token_idx + ) + + # Now distinguish the O cases from ones where we miss the tokenization + entity_positions = _get_entity_positions(entities) + _handle_not_an_entity(bilou, tokens, entity_positions, missing) + + return bilou + + +def _add_bilou_tags_to_entities( + bilou: List[Text], + entities: List[Tuple[int, int, Text]], + end_pos_to_token_idx: Dict[int, int], + start_pos_to_token_idx: Dict[int, int], +): + for start_pos, end_pos, label in entities: + start_token_idx = start_pos_to_token_idx.get(start_pos) + end_token_idx = end_pos_to_token_idx.get(end_pos) + + # Only interested if the tokenization is correct + if start_token_idx is not None and end_token_idx is not None: + if start_token_idx == end_token_idx: + bilou[start_token_idx] = f"U-{label}" + else: + bilou[start_token_idx] = f"B-{label}" + for i in range(start_token_idx + 1, end_token_idx): + bilou[i] = f"I-{label}" + bilou[end_token_idx] = f"L-{label}" + + +def _get_entity_positions(entities: List[Tuple[int, int, Text]]) -> Set[int]: + entity_positions = set() + + for start_pos, end_pos, label in entities: + for i in range(start_pos, end_pos): + entity_positions.add(i) + + return entity_positions + + +def _handle_not_an_entity( + bilou: List[Text], tokens: List[Token], entity_positions: Set[int], missing: Text +): + for n, token in enumerate(tokens): + for i in range(token.start, token.end): + if i in entity_positions: + break + else: + bilou[n] = missing diff --git a/tests/nlu/base/__init__.py b/rasa/nlu/utils/hugging_face/__init__.py similarity index 100% rename from tests/nlu/base/__init__.py rename to rasa/nlu/utils/hugging_face/__init__.py diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py new file mode 100644 index 000000000000..0ae86b3acbf6 --- /dev/null +++ b/rasa/nlu/utils/hugging_face/hf_transformers.py @@ -0,0 +1,349 @@ +import logging +from typing import Any, Dict, List, Text, Tuple, Optional + +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.nlu.components import Component +from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu.training_data import Message, TrainingData +from rasa.nlu.tokenizers.tokenizer import Token +import rasa.utils.train_utils as train_utils +import numpy as np + +from rasa.nlu.constants import ( + TEXT, + LANGUAGE_MODEL_DOCS, + DENSE_FEATURIZABLE_ATTRIBUTES, + TOKEN_IDS, + TOKENS, + SENTENCE_FEATURES, + SEQUENCE_FEATURES, +) + +logger = logging.getLogger(__name__) + + +class HFTransformersNLP(Component): + """Utility Component for interfacing between Transformers library. + + The transformers(https://github.com/huggingface/transformers) library + is used to load pre-trained language models like BERT, GPT-2, etc. + The component also tokenizes and featurizes dense featurizable attributes of each + message. + """ + + defaults = { + # name of the language model to load. 
+ "model_name": "bert", + # Pre-Trained weights to be loaded(string) + "model_weights": None, + } + + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + super(HFTransformersNLP, self).__init__(component_config) + + self._load_model() + self.whitespace_tokenizer = WhitespaceTokenizer() + + def _load_model(self) -> None: + """Try loading the model""" + + from rasa.nlu.utils.hugging_face.registry import ( + model_class_dict, + model_weights_defaults, + model_tokenizer_dict, + ) + + self.model_name = self.component_config["model_name"] + + if self.model_name not in model_class_dict: + raise KeyError( + f"'{self.model_name}' not a valid model name. Choose from " + f"{str(list(model_class_dict.keys()))}or create" + f"a new class inheriting from this class to support your model." + ) + + self.model_weights = self.component_config["model_weights"] + + if not self.model_weights: + logger.info( + f"Model weights not specified. Will choose default model weights: " + f"{model_weights_defaults[self.model_name]}" + ) + self.model_weights = model_weights_defaults[self.model_name] + + logger.debug(f"Loading Tokenizer and Model for {self.model_name}") + + self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained( + self.model_weights + ) + self.model = model_class_dict[self.model_name].from_pretrained( + self.model_weights + ) + + # Use a universal pad token since all transformer architectures do not have a + # consistent token. Instead of pad_token_id we use unk_token_id because + # pad_token_id is not set for all architectures. We can't add a new token as + # well since vocabulary resizing is not yet supported for TF classes. + # Also, this does not hurt the model predictions since we use an attention mask + # while feeding input. 
+ self.pad_token_id = self.tokenizer.unk_token_id + + @classmethod + def required_packages(cls) -> List[Text]: + return ["transformers"] + + def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]: + split_token_ids = self.tokenizer.encode(text, add_special_tokens=False) + + split_token_strings = self.tokenizer.convert_ids_to_tokens(split_token_ids) + + return split_token_ids, split_token_strings + + def _add_lm_specific_special_tokens( + self, token_ids: List[List[int]] + ) -> List[List[int]]: + from rasa.nlu.utils.hugging_face.registry import ( + model_special_tokens_pre_processors, + ) + + augmented_tokens = [ + model_special_tokens_pre_processors[self.model_name](example_token_ids) + for example_token_ids in token_ids + ] + return augmented_tokens + + def _lm_specific_token_cleanup(self, token_strings: List[Text]) -> List[Text]: + from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners + + return model_tokens_cleaners[self.model_name](token_strings) + + def _post_process_sequence_embeddings( + self, sequence_embeddings: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: + + from rasa.nlu.utils.hugging_face.registry import ( + model_embeddings_post_processors, + ) + + sentence_embeddings = [] + post_processed_sequence_embeddings = [] + + for example_embedding in sequence_embeddings: + ( + example_sentence_embedding, + example_post_processed_embedding, + ) = model_embeddings_post_processors[self.model_name](example_embedding) + + sentence_embeddings.append(example_sentence_embedding) + post_processed_sequence_embeddings.append(example_post_processed_embedding) + + return ( + np.array(sentence_embeddings), + np.array(post_processed_sequence_embeddings), + ) + + def _tokenize_example( + self, message: Message, attribute: Text + ) -> Tuple[List[Token], List[int]]: + + tokens_in = self.whitespace_tokenizer.tokenize(message, attribute) + + tokens_out = [] + + token_ids_out = [] + + for token in tokens_in: + # use lm specific tokenizer to further tokenize the text + split_token_ids, split_token_strings = self._lm_tokenize(token.text) + + split_token_strings = self._lm_specific_token_cleanup(split_token_strings) + + token_ids_out += split_token_ids + + tokens_out += train_utils.align_tokens( + split_token_strings, token.end, token.start + ) + + return tokens_out, token_ids_out + + def _get_token_ids_for_batch( + self, batch_examples: List[Message], attribute: Text + ) -> Tuple[List[List[Token]], List[List[int]]]: + + batch_token_ids = [] + batch_tokens = [] + for example in batch_examples: + + example_tokens, example_token_ids = self._tokenize_example( + example, attribute + ) + batch_tokens.append(example_tokens) + batch_token_ids.append(example_token_ids) + + return batch_tokens, batch_token_ids + + @staticmethod + def _compute_attention_mask(actual_sequence_lengths: List[int]) -> np.ndarray: + + attention_mask = [] + max_seq_length = max(actual_sequence_lengths) + for actual_sequence_length in actual_sequence_lengths: + # add 1s for present tokens, fill up the remaining space up to max + # sequence length with 0s (non-existing tokens) + padded_sequence = [1] * actual_sequence_length + [0] * ( + max_seq_length - actual_sequence_length + ) + attention_mask.append(padded_sequence) + + attention_mask = np.array(attention_mask).astype(np.float32) + + return attention_mask + + def _add_padding_to_batch( + self, batch_token_ids: List[List[int]] + ) -> Tuple[List[int], List[List[int]]]: + padded_token_ids = [] + # Compute max length across examples + max_seq_len = 0 + 
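        # Illustration (not part of the original change): for batch_token_ids
        # [[101, 7592, 102], [101, 7592, 2088, 999, 102]] the first example is
        # padded with two pad_token_id entries to length 5, and
        # _compute_attention_mask([3, 5]) above returns
        # [[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]], so the model ignores the padding.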
actual_sequence_lengths = [] + + for example_token_ids in batch_token_ids: + actual_sequence_lengths.append(len(example_token_ids)) + max_seq_len = max(max_seq_len, len(example_token_ids)) + + # Add padding according to max_seq_len + # Some models don't contain pad token, we use unknown token as padding token. + # This doesn't affect the computation since we compute an attention mask + # anyways. + for example_token_ids in batch_token_ids: + padded_token_ids.append( + example_token_ids + + [self.pad_token_id] * (max_seq_len - len(example_token_ids)) + ) + return actual_sequence_lengths, padded_token_ids + + @staticmethod + def _extract_nonpadded_embeddings( + embeddings: np.ndarray, actual_sequence_lengths: List[int] + ) -> np.ndarray: + nonpadded_sequence_embeddings = [] + for index, embedding in enumerate(embeddings): + unmasked_embedding = embedding[: actual_sequence_lengths[index]] + nonpadded_sequence_embeddings.append(unmasked_embedding) + + return np.array(nonpadded_sequence_embeddings) + + def _compute_batch_sequence_features( + self, batch_attention_mask: np.ndarray, padded_token_ids: List[List[int]] + ) -> np.ndarray: + model_outputs = self.model( + np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask) + ) + + # sequence hidden states is always the first output from all models + sequence_hidden_states = model_outputs[0] + + sequence_hidden_states = sequence_hidden_states.numpy() + return sequence_hidden_states + + def _get_model_features_for_batch( + self, batch_token_ids: List[List[int]] + ) -> Tuple[np.ndarray, np.ndarray]: + # Let's first add tokenizer specific special tokens to all examples + batch_token_ids_augmented = self._add_lm_specific_special_tokens( + batch_token_ids + ) + + # Let's first add padding so that whole batch can be fed to the model + actual_sequence_lengths, padded_token_ids = self._add_padding_to_batch( + batch_token_ids_augmented + ) + + # Compute attention mask based on actual_sequence_length + batch_attention_mask = self._compute_attention_mask(actual_sequence_lengths) + + # Get token level features from the model + sequence_hidden_states = self._compute_batch_sequence_features( + batch_attention_mask, padded_token_ids + ) + + # Extract features for only non-padding tokens + sequence_nonpadded_embeddings = self._extract_nonpadded_embeddings( + sequence_hidden_states, actual_sequence_lengths + ) + + # Extract sentence level and post-processed features + ( + sentence_embeddings, + sequence_final_embeddings, + ) = self._post_process_sequence_embeddings(sequence_nonpadded_embeddings) + + return sentence_embeddings, sequence_final_embeddings + + def _get_docs_for_batch( + self, batch_examples: List[Message], attribute: Text + ) -> List[Dict[Text, Any]]: + + batch_tokens, batch_token_ids = self._get_token_ids_for_batch( + batch_examples, attribute + ) + + ( + batch_sentence_features, + batch_sequence_features, + ) = self._get_model_features_for_batch(batch_token_ids) + + # A doc consists of + # {'token_ids': ..., 'tokens': ..., 'sequence_features': ..., 'sentence_features': ...} + batch_docs = [] + for index in range(len(batch_examples)): + doc = { + TOKEN_IDS: batch_token_ids[index], + TOKENS: batch_tokens[index], + SEQUENCE_FEATURES: batch_sequence_features[index], + SENTENCE_FEATURES: np.reshape(batch_sentence_features[index], (1, -1)), + } + batch_docs.append(doc) + + return batch_docs + + def train( + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, + ) -> None: + + batch_size = 64 
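        # The loop below fills LANGUAGE_MODEL_DOCS[attribute] for all training
        # examples in batches of 64. A rough, self-contained sketch of what such a
        # doc contains, using the transformers library directly (an illustration,
        # not code from this change; the model and example text are assumptions):
        #
        #   import numpy as np
        #   from transformers import BertTokenizer, TFBertModel
        #
        #   tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        #   model = TFBertModel.from_pretrained("bert-base-uncased")
        #
        #   token_ids = tokenizer.encode("hello world", add_special_tokens=False)
        #   token_ids = [101] + token_ids + [102]        # BERT CLS/SEP ids
        #   attention_mask = np.ones((1, len(token_ids)), dtype=np.float32)
        #
        #   sequence_output = model(
        #       np.array([token_ids]), attention_mask=attention_mask
        #   )[0].numpy()                                 # shape (1, seq_len, 768)
        #   sentence_features = sequence_output[0, 0]    # CLS token embedding
        #   sequence_features = sequence_output[0, 1:-1] # per-token embeddings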
+ + for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: + + non_empty_examples = list( + filter(lambda x: x.get(attribute), training_data.training_examples) + ) + + batch_start_index = 0 + + while batch_start_index < len(non_empty_examples): + + batch_end_index = min( + batch_start_index + batch_size, len(non_empty_examples) + ) + # Collect batch examples + batch_messages = non_empty_examples[batch_start_index:batch_end_index] + + # Construct a doc with relevant features extracted(tokens, dense_features) + batch_docs = self._get_docs_for_batch(batch_messages, attribute) + + for index, ex in enumerate(batch_messages): + + ex.set(LANGUAGE_MODEL_DOCS[attribute], batch_docs[index]) + + batch_start_index += batch_size + + def process(self, message: Message, **kwargs: Any) -> None: + + message.set( + LANGUAGE_MODEL_DOCS[TEXT], + self._get_docs_for_batch([message], attribute=TEXT)[0], + ) diff --git a/rasa/nlu/utils/hugging_face/registry.py b/rasa/nlu/utils/hugging_face/registry.py new file mode 100644 index 000000000000..a6d68cde8747 --- /dev/null +++ b/rasa/nlu/utils/hugging_face/registry.py @@ -0,0 +1,95 @@ +import logging + +# Explicitly set logging level for this module before any import +# because otherwise it logs tensorflow/pytorch versions +logging.getLogger("transformers.file_utils").setLevel(logging.WARNING) + +from transformers import ( + TFBertModel, + TFOpenAIGPTModel, + TFGPT2Model, + TFXLNetModel, + # TFXLMModel, + TFDistilBertModel, + TFRobertaModel, + BertTokenizer, + OpenAIGPTTokenizer, + GPT2Tokenizer, + XLNetTokenizer, + # XLMTokenizer, + DistilBertTokenizer, + RobertaTokenizer, +) +from rasa.nlu.utils.hugging_face.transformers_pre_post_processors import ( + bert_tokens_pre_processor, + gpt_tokens_pre_processor, + xlnet_tokens_pre_processor, + roberta_tokens_pre_processor, + bert_embeddings_post_processor, + gpt_embeddings_post_processor, + xlnet_embeddings_post_processor, + roberta_embeddings_post_processor, + bert_tokens_cleaner, + openaigpt_tokens_cleaner, + gpt2_tokens_cleaner, + xlnet_tokens_cleaner, +) + + +model_class_dict = { + "bert": TFBertModel, + "gpt": TFOpenAIGPTModel, + "gpt2": TFGPT2Model, + "xlnet": TFXLNetModel, + # "xlm": TFXLMModel, # Currently doesn't work because of a bug in transformers library https://github.com/huggingface/transformers/issues/2729 + "distilbert": TFDistilBertModel, + "roberta": TFRobertaModel, +} +model_tokenizer_dict = { + "bert": BertTokenizer, + "gpt": OpenAIGPTTokenizer, + "gpt2": GPT2Tokenizer, + "xlnet": XLNetTokenizer, + # "xlm": XLMTokenizer, + "distilbert": DistilBertTokenizer, + "roberta": RobertaTokenizer, +} +model_weights_defaults = { + "bert": "bert-base-uncased", + "gpt": "openai-gpt", + "gpt2": "gpt2", + "xlnet": "xlnet-base-cased", + # "xlm": "xlm-mlm-enfr-1024", + "distilbert": "distilbert-base-uncased", + "roberta": "roberta-base", +} + +model_special_tokens_pre_processors = { + "bert": bert_tokens_pre_processor, + "gpt": gpt_tokens_pre_processor, + "gpt2": gpt_tokens_pre_processor, + "xlnet": xlnet_tokens_pre_processor, + # "xlm": xlm_tokens_pre_processor, + "distilbert": bert_tokens_pre_processor, + "roberta": roberta_tokens_pre_processor, +} + +model_tokens_cleaners = { + "bert": bert_tokens_cleaner, + "gpt": openaigpt_tokens_cleaner, + "gpt2": gpt2_tokens_cleaner, + "xlnet": xlnet_tokens_cleaner, + # "xlm": xlm_tokens_pre_processor, + "distilbert": bert_tokens_cleaner, # uses the same as BERT + "roberta": gpt2_tokens_cleaner, # Uses the same as GPT2 +} + +model_embeddings_post_processors = { + "bert": 
bert_embeddings_post_processor, + "gpt": gpt_embeddings_post_processor, + "gpt2": gpt_embeddings_post_processor, + "xlnet": xlnet_embeddings_post_processor, + # "xlm": xlm_embeddings_post_processor, + "distilbert": bert_embeddings_post_processor, + "roberta": roberta_embeddings_post_processor, +} diff --git a/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py new file mode 100644 index 000000000000..27f02feedbde --- /dev/null +++ b/rasa/nlu/utils/hugging_face/transformers_pre_post_processors.py @@ -0,0 +1,148 @@ +from typing import List, Tuple, Text +import numpy as np + + +def bert_tokens_pre_processor(token_ids: List[int]) -> List[int]: + """Add BERT style special tokens(CLS and SEP)""" + BERT_CLS_ID = 101 + BERT_SEP_ID = 102 + + processed_tokens = token_ids + + processed_tokens.insert(0, BERT_CLS_ID) + processed_tokens.append(BERT_SEP_ID) + + return processed_tokens + + +def gpt_tokens_pre_processor(token_ids: List[int]) -> List[int]: + return token_ids + + +def xlnet_tokens_pre_processor(token_ids: List[int]) -> List[int]: + """Add XLNET style special tokens""" + XLNET_CLS_ID = 3 + XLNET_SEP_ID = 4 + + token_ids.append(XLNET_SEP_ID) + token_ids.append(XLNET_CLS_ID) + + return token_ids + + +def roberta_tokens_pre_processor(token_ids: List[int]) -> List[int]: + """Add RoBERTa style special tokens""" + ROBERTA_BEG_ID = 0 + ROBERTA_END_ID = 2 + + token_ids.insert(0, ROBERTA_BEG_ID) + token_ids.append(ROBERTA_END_ID) + + return token_ids + + +def xlm_tokens_pre_processor(token_ids: List[int]) -> List[int]: + """Add RoBERTa style special tokens""" + XLM_SEP_ID = 1 + + token_ids.insert(0, XLM_SEP_ID) + token_ids.append(XLM_SEP_ID) + + return token_ids + + +def bert_embeddings_post_processor( + sequence_embeddings: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray]: + """Post process embeddings from BERT + + by removing CLS and SEP embeddings and returning CLS token embedding as + sentence representation""" + sentence_embedding = sequence_embeddings[0] + post_processed_embedding = sequence_embeddings[1:-1] + + return sentence_embedding, post_processed_embedding + + +def gpt_embeddings_post_processor( + sequence_embeddings: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray]: + """Post process embeddings from GPT models + + by taking a mean over sequence embeddings and returning that as sentence + representation""" + sentence_embedding = np.mean(sequence_embeddings, axis=0) + post_processed_embedding = sequence_embeddings + + return sentence_embedding, post_processed_embedding + + +def xlnet_embeddings_post_processor( + sequence_embeddings: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray]: + """Post process embeddings from XLNet models + + by taking a mean over sequence embeddings and returning that as sentence + representation. Remove last two time steps corresponding + to special tokens from the sequence embeddings.""" + post_processed_embedding = sequence_embeddings[:-2] + sentence_embedding = np.mean(post_processed_embedding, axis=0) + + return sentence_embedding, post_processed_embedding + + +def roberta_embeddings_post_processor( + sequence_embeddings: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray]: + """Post process embeddings from Roberta models + + by taking a mean over sequence embeddings and returning that as sentence + representation. 
Remove first and last time steps + corresponding to special tokens from the sequence embeddings.""" + + post_processed_embedding = sequence_embeddings[1:-1] + sentence_embedding = np.mean(post_processed_embedding, axis=0) + + return sentence_embedding, post_processed_embedding + + +def xlm_embeddings_post_processor( + sequence_embeddings: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray]: + """Post process embeddings from XLM models + + by taking a mean over sequence embeddings and returning that as sentence + representation. Remove first and last time steps + corresponding to special tokens from the sequence embeddings.""" + post_processed_embedding = sequence_embeddings[1:-1] + sentence_embedding = np.mean(post_processed_embedding, axis=0) + + return sentence_embedding, post_processed_embedding + + +def bert_tokens_cleaner(token_strings: List[Text]) -> List[Text]: + """Clean up tokens with the extra delimiters(##) BERT adds while breaking a token + into sub-tokens""" + tokens = [string.replace("##", "") for string in token_strings] + return [string for string in tokens if string] + + +def openaigpt_tokens_cleaner(token_strings: List[Text]) -> List[Text]: + """Clean up tokens with the extra delimiters() OpenAIGPT adds while breaking a + token into sub-tokens""" + tokens = [string.replace("", "") for string in token_strings] + return [string for string in tokens if string] + + +def gpt2_tokens_cleaner(token_strings: List[Text]) -> List[Text]: + """Clean up tokens with the extra delimiters() GPT2 adds while breaking a token + into sub-tokens""" + tokens = [string.replace("Ġ", "") for string in token_strings] + return [string for string in tokens if string] + + +def xlnet_tokens_cleaner(token_strings: List[Text]) -> List[Text]: + """Clean up tokens with the extra delimiters(▁) XLNet adds while breaking a token + into sub-tokens""" + tokens = [string.replace("▁", "") for string in token_strings] + return [string for string in tokens if string] diff --git a/rasa/nlu/utils/mitie_utils.py b/rasa/nlu/utils/mitie_utils.py index 2dfaa0202d72..91d37cc392d7 100644 --- a/rasa/nlu/utils/mitie_utils.py +++ b/rasa/nlu/utils/mitie_utils.py @@ -12,8 +12,6 @@ class MitieNLP(Component): - provides = ["mitie_feature_extractor", "mitie_file"] - defaults = { # name of the language model to load - this contains # the MITIE feature extractor diff --git a/rasa/nlu/utils/spacy_utils.py b/rasa/nlu/utils/spacy_utils.py index f06ff5f7459f..3eae015409d1 100644 --- a/rasa/nlu/utils/spacy_utils.py +++ b/rasa/nlu/utils/spacy_utils.py @@ -14,13 +14,10 @@ from spacy.tokens.doc import Doc # pytype: disable=import-error from rasa.nlu.model import Metadata -from rasa.nlu.constants import TEXT_ATTRIBUTE, SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES +from rasa.nlu.constants import TEXT, SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES class SpacyNLP(Component): - provides = ["spacy_nlp"] + [ - SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES - ] defaults = { # name of the language model to load - if it is not set @@ -229,7 +226,7 @@ def train( def process(self, message: Message, **kwargs: Any) -> None: - message.set(SPACY_DOCS[TEXT_ATTRIBUTE], self.doc_for_text(message.text)) + message.set(SPACY_DOCS[TEXT], self.doc_for_text(message.text)) @classmethod def load( diff --git a/rasa/utils/common.py b/rasa/utils/common.py index 1dfe48643631..1f8b05e26237 100644 --- a/rasa/utils/common.py +++ b/rasa/utils/common.py @@ -94,11 +94,7 @@ def update_apscheduler_log_level() -> None: def update_socketio_log_level() -> None: 
log_level = os.environ.get(ENV_LOG_LEVEL_LIBRARIES, DEFAULT_LOG_LEVEL_LIBRARIES) - socketio_loggers = [ - "websockets.protocol", - "engineio.server", - "socketio.server", - ] + socketio_loggers = ["websockets.protocol", "engineio.server", "socketio.server"] for logger_name in socketio_loggers: logging.getLogger(logger_name).setLevel(log_level) diff --git a/rasa/utils/io.py b/rasa/utils/io.py index bed851312433..946bb4487793 100644 --- a/rasa/utils/io.py +++ b/rasa/utils/io.py @@ -3,6 +3,7 @@ import json import logging import os +import pickle import tarfile import tempfile import typing @@ -157,6 +158,29 @@ def dump_obj_as_json_to_file(filename: Text, obj: Any) -> None: write_text_file(json.dumps(obj, indent=2), filename) +def pickle_dump(filename: Union[Text, Path], obj: Any) -> None: + """Saves object to file. + + Args: + filename: the filename to save the object to + obj: the object to store + """ + with open(filename, "wb") as f: + pickle.dump(obj, f) + + +def pickle_load(filename: Union[Text, Path]) -> Any: + """Loads an object from a file. + + Args: + filename: the filename to load the object from + + Returns: the loaded object + """ + with open(filename, "rb") as f: + return pickle.load(f) + + def read_config_file(filename: Text) -> Dict[Text, Any]: """Parses a yaml configuration file. Content needs to be a dictionary @@ -259,6 +283,12 @@ def create_temporary_file(data: Any, suffix: Text = "", mode: Text = "w+") -> Te return f.name +def create_temporary_directory() -> Text: + """Creates a tempfile.TemporaryDirectory.""" + f = tempfile.TemporaryDirectory() + return f.name + + def create_path(file_path: Text) -> None: """Makes sure all directories in the 'file_path' exists.""" @@ -267,7 +297,7 @@ def create_path(file_path: Text) -> None: os.makedirs(parent_dir) -def create_directory_for_file(file_path: Text) -> None: +def create_directory_for_file(file_path: Union[Text, Path]) -> None: """Creates any missing parent directories of this file path.""" create_directory(os.path.dirname(file_path)) @@ -394,3 +424,35 @@ def zip_folder(folder: Text) -> Text: # WARN: not thread-safe! return shutil.make_archive(zipped_path.name, "zip", folder) + + +def json_unpickle(file_name: Union[Text, Path]) -> Any: + """Unpickle an object from file using json. + + Args: + file_name: the file to load the object from + + Returns: the object + """ + import jsonpickle.ext.numpy as jsonpickle_numpy + import jsonpickle + + jsonpickle_numpy.register_handlers() + + file_content = read_file(file_name) + return jsonpickle.loads(file_content) + + +def json_pickle(file_name: Union[Text, Path], obj: Any) -> None: + """Pickle an object to a file using json. 
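+ + Uses jsonpickle with its numpy extension handlers registered so that numpy arrays can be serialized.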
+ + Args: + file_name: the file to store the object to + obj: the object to store + """ + import jsonpickle.ext.numpy as jsonpickle_numpy + import jsonpickle + + jsonpickle_numpy.register_handlers() + + write_text_file(jsonpickle.dumps(obj), file_name) diff --git a/tests/nlu/training/__init__.py b/rasa/utils/tensorflow/__init__.py similarity index 100% rename from tests/nlu/training/__init__.py rename to rasa/utils/tensorflow/__init__.py diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py new file mode 100644 index 000000000000..3e13221041d0 --- /dev/null +++ b/rasa/utils/tensorflow/constants.py @@ -0,0 +1,67 @@ +# constants for configuration parameters of our tensorflow models + +LABEL = "label" +HIDDEN_LAYERS_SIZES = "hidden_layers_sizes" +SHARE_HIDDEN_LAYERS = "share_hidden_layers" + +TRANSFORMER_SIZE = "transformer_size" +NUM_TRANSFORMER_LAYERS = "number_of_transformer_layers" +NUM_HEADS = "number_of_attention_heads" +UNIDIRECTIONAL_ENCODER = "unidirectional_encoder" +KEY_RELATIVE_ATTENTION = "use_key_relative_attention" +VALUE_RELATIVE_ATTENTION = "use_value_relative_attention" +MAX_RELATIVE_POSITION = "max_relative_position" + +BATCH_SIZES = "batch_size" +BATCH_STRATEGY = "batch_strategy" +EPOCHS = "epochs" +RANDOM_SEED = "random_seed" +LEARNING_RATE = "learning_rate" + +DENSE_DIMENSION = "dense_dimension" +EMBEDDING_DIMENSION = "embedding_dimension" + +SIMILARITY_TYPE = "similarity_type" +LOSS_TYPE = "loss_type" +NUM_NEG = "number_of_negative_examples" +MAX_POS_SIM = "maximum_positive_similarity" +MAX_NEG_SIM = "maximum_negative_similarity" +USE_MAX_NEG_SIM = "use_maximum_negative_similarity" + +SCALE_LOSS = "scale_loss" +REGULARIZATION_CONSTANT = "regularization_constant" +NEGATIVE_MARGIN_SCALE = "negative_margin_scale" +DROP_RATE = "drop_rate" +DROP_RATE_ATTENTION = "drop_rate_attention" +DROP_RATE_DIALOGUE = "drop_rate_dialogue" +DROP_RATE_LABEL = "drop_rate_label" + +WEIGHT_SPARSITY = "weight_sparsity" + +EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs" +EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples" + +INTENT_CLASSIFICATION = "intent_classification" +ENTITY_RECOGNITION = "entity_recognition" +MASKED_LM = "use_masked_language_model" + +SPARSE_INPUT_DROPOUT = "use_sparse_input_dropout" + +RANKING_LENGTH = "ranking_length" + +BILOU_FLAG = "BILOU_flag" + +RETRIEVAL_INTENT = "retrieval_intent" + +SOFTMAX = "softmax" +MARGIN = "margin" +AUTO = "auto" +INNER = "inner" +COSINE = "cosine" + +BALANCED = "balanced" +SEQUENCE = "sequence" + +POOLING = "pooling" +MAX_POOLING = "max" +MEAN_POOLING = "mean" diff --git a/rasa/utils/tensorflow/environment.py b/rasa/utils/tensorflow/environment.py new file mode 100644 index 000000000000..cc8977f38e1b --- /dev/null +++ b/rasa/utils/tensorflow/environment.py @@ -0,0 +1,142 @@ +import logging +import os +from typing import Text, Dict +import typing +import rasa.utils.common as rasa_utils +from rasa.constants import ( + ENV_GPU_CONFIG, + ENV_CPU_INTER_OP_CONFIG, + ENV_CPU_INTRA_OP_CONFIG, +) + +if typing.TYPE_CHECKING: + from tensorflow import config as tf_config + +logger = logging.getLogger(__name__) + + +def _setup_gpu_environment() -> None: + """Set configuration for TensorFlow GPU environment based on the environment variable set.""" + + gpu_memory_config = os.getenv(ENV_GPU_CONFIG) + + if not gpu_memory_config: + return + + # Import from tensorflow only if necessary (environment variable was set) + from tensorflow import config as tf_config + + parsed_gpu_config = _parse_gpu_config(gpu_memory_config) + 
physical_gpus = tf_config.list_physical_devices("GPU") + + # Logic taken from https://www.tensorflow.org/guide/gpu + if physical_gpus: + for gpu_id, gpu_id_memory in parsed_gpu_config.items(): + _allocate_gpu_memory(physical_gpus[gpu_id], gpu_id_memory) + + else: + rasa_utils.raise_warning( + f"You have an environment variable '{ENV_GPU_CONFIG}' set but no GPUs were detected to configure." + ) + + +def _allocate_gpu_memory( + gpu_instance: "tf_config.PhysicalDevice", logical_memory: int +) -> None: + """Create a new logical device for the requested amount of memory. + + Args: + gpu_instance: PhysicalDevice instance of a GPU device. + logical_memory: Absolute amount of memory to be allocated to the new logical device. + """ + + from tensorflow import config as tf_config + + try: + tf_config.experimental.set_virtual_device_configuration( + gpu_instance, + [ + tf_config.experimental.VirtualDeviceConfiguration( + memory_limit=logical_memory + ) + ], + ) + + except RuntimeError: + # Helper explanation of where the error comes from + raise RuntimeError( + "Error while setting up tensorflow environment. " + "Virtual devices must be set before GPUs have been initialized." + ) + + +def _parse_gpu_config(gpu_memory_config: Text) -> Dict[int, int]: + """Parse GPU configuration variable from a string to a dict. + + Args: + gpu_memory_config: String containing the configuration for GPU usage. + + Returns: + Parsed configuration as a dictionary with GPU IDs as keys and requested memory as the value. + """ + + # gpu_config is of format "gpu_id_1:gpu_id_1_memory, gpu_id_2: gpu_id_2_memory" + # Parse it and store in a dictionary + parsed_gpu_config = {} + + try: + for instance in gpu_memory_config.split(","): + instance_gpu_id, instance_gpu_mem = instance.split(":") + instance_gpu_id = int(instance_gpu_id) + instance_gpu_mem = int(instance_gpu_mem) + + parsed_gpu_config[instance_gpu_id] = instance_gpu_mem + except ValueError: + # Helper explanation of where the error comes from + raise ValueError( + f"Error parsing GPU configuration. Please cross-check the format of '{ENV_GPU_CONFIG}' " + f"at https://rasa.com/docs/rasa/api/tensorflow_usage.html#restricting-absolute-gpu-memory-available ." + ) + + return parsed_gpu_config + + +def _setup_cpu_environment() -> None: + """Set configuration for the CPU environment based on the environment variable set.""" + + inter_op_parallel_threads = os.getenv(ENV_CPU_INTER_OP_CONFIG) + intra_op_parallel_threads = os.getenv(ENV_CPU_INTRA_OP_CONFIG) + + if not inter_op_parallel_threads and not intra_op_parallel_threads: + return + + from tensorflow import config as tf_config + + if inter_op_parallel_threads: + try: + inter_op_parallel_threads = int(inter_op_parallel_threads.strip()) + except ValueError: + raise ValueError( + f"Error parsing the environment variable '{ENV_CPU_INTER_OP_CONFIG}'. Please " + f"cross-check the value." + ) + + tf_config.threading.set_inter_op_parallelism_threads(inter_op_parallel_threads) + + if intra_op_parallel_threads: + try: + intra_op_parallel_threads = int(intra_op_parallel_threads.strip()) + except ValueError: + raise ValueError( + f"Error parsing the environment variable '{ENV_CPU_INTRA_OP_CONFIG}'. Please " + f"cross-check the value." 
+ ) + + tf_config.threading.set_intra_op_parallelism_threads(intra_op_parallel_threads) + + +def setup_tf_environment() -> None: + """Setup CPU and GPU related environment settings for TensorFlow.""" + + _setup_cpu_environment() + _setup_gpu_environment() diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py new file mode 100644 index 000000000000..55c0ddbe3a0f --- /dev/null +++ b/rasa/utils/tensorflow/layers.py @@ -0,0 +1,632 @@ +import logging +from typing import List, Optional, Text, Tuple, Callable, Union, Any +import tensorflow as tf +import tensorflow_addons as tfa +from tensorflow.python.keras.utils import tf_utils +from tensorflow.python.keras import backend as K +from rasa.utils.tensorflow.constants import SOFTMAX, MARGIN, COSINE, INNER + +logger = logging.getLogger(__name__) + + +class SparseDropout(tf.keras.layers.Dropout): + def call( + self, inputs: tf.Tensor, training: Optional[Union[tf.Tensor, bool]] = None + ) -> tf.Tensor: + if training is None: + training = K.learning_phase() + + def dropped_inputs() -> tf.Tensor: + to_retain_prob = tf.random.uniform( + tf.shape(inputs.values), 0, 1, inputs.values.dtype + ) + to_retain = tf.greater_equal(to_retain_prob, self.rate) + return tf.sparse.retain(inputs, to_retain) + + outputs = tf_utils.smart_cond( + training, dropped_inputs, lambda: tf.identity(inputs) + ) + # need to explicitly set shape, because it becomes dynamic after `retain` + # noinspection PyProtectedMember + outputs._dense_shape = inputs._dense_shape + + return outputs + + +class DenseForSparse(tf.keras.layers.Dense): + """Dense layer for sparse input tensor.""" + + def __init__(self, reg_lambda: float = 0, **kwargs: Any) -> None: + if reg_lambda > 0: + regularizer = tf.keras.regularizers.l2(reg_lambda) + else: + regularizer = None + + super().__init__(kernel_regularizer=regularizer, **kwargs) + + def call(self, inputs: tf.SparseTensor) -> tf.Tensor: + if not isinstance(inputs, tf.SparseTensor): + raise ValueError("Input tensor should be sparse.") + + # outputs will be 2D + outputs = tf.sparse.sparse_dense_matmul( + tf.sparse.reshape(inputs, [-1, tf.shape(inputs)[-1]]), self.kernel + ) + + if len(inputs.shape) == 3: + # reshape back + outputs = tf.reshape( + outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1) + ) + + if self.use_bias: + outputs = tf.nn.bias_add(outputs, self.bias) + if self.activation is not None: + return self.activation(outputs) + return outputs + + +class DenseWithSparseWeights(tf.keras.layers.Dense): + def __init__(self, sparsity: float = 0.8, **kwargs: Any) -> None: + super().__init__(**kwargs) + self.sparsity = sparsity + + def build(self, input_shape: tf.TensorShape) -> None: + super().build(input_shape) + # create random mask to set some weights to 0 + kernel_mask = tf.random.uniform(tf.shape(self.kernel), 0, 1) + kernel_mask = tf.cast( + tf.greater_equal(kernel_mask, self.sparsity), self.kernel.dtype + ) + self.kernel_mask = tf.Variable( + initial_value=kernel_mask, trainable=False, name="kernel_mask" + ) + + def call(self, inputs: tf.Tensor) -> tf.Tensor: + # set some weights to 0 according to precomputed mask + self.kernel.assign(self.kernel * self.kernel_mask) + return super().call(inputs) + + +class Ffnn(tf.keras.layers.Layer): + """Create feed-forward network with hidden layers and name suffix.""" + + def __init__( + self, + layer_sizes: List[int], + dropout_rate: float, + reg_lambda: float, + sparsity: float, + layer_name_suffix: Text, + ) -> None: + super().__init__(name=f"ffnn_{layer_name_suffix}") + + 
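# each hidden layer uses randomly sparsified weights and GELU activation, and each is followed by dropout +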
l2_regularizer = tf.keras.regularizers.l2(reg_lambda) + self._ffn_layers = [] + for i, layer_size in enumerate(layer_sizes): + self._ffn_layers.append( + DenseWithSparseWeights( + units=layer_size, + sparsity=sparsity, + activation=tfa.activations.gelu, + kernel_regularizer=l2_regularizer, + name=f"hidden_layer_{layer_name_suffix}_{i}", + ) + ) + self._ffn_layers.append(tf.keras.layers.Dropout(dropout_rate)) + + def call( + self, x: tf.Tensor, training: Optional[Union[tf.Tensor, bool]] = None + ) -> tf.Tensor: + for layer in self._ffn_layers: + x = layer(x, training=training) + + return x + + +class Embed(tf.keras.layers.Layer): + """Create dense embedding layer with a name.""" + + def __init__( + self, + embed_dim: int, + reg_lambda: float, + layer_name_suffix: Text, + similarity_type: Optional[Text] = None, + ) -> None: + super().__init__(name=f"embed_{layer_name_suffix}") + + self.similarity_type = similarity_type + if self.similarity_type and self.similarity_type not in {COSINE, INNER}: + raise ValueError( + f"Wrong similarity type '{self.similarity_type}', " + f"should be '{COSINE}' or '{INNER}'." + ) + + regularizer = tf.keras.regularizers.l2(reg_lambda) + self._dense = tf.keras.layers.Dense( + units=embed_dim, + activation=None, + kernel_regularizer=regularizer, + name=f"embed_layer_{layer_name_suffix}", + ) + + def call(self, x: tf.Tensor) -> tf.Tensor: + x = self._dense(x) + if self.similarity_type == COSINE: + x = tf.nn.l2_normalize(x, axis=-1) + + return x + + +class InputMask(tf.keras.layers.Layer): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + self._masking_prob = 0.85 + self._mask_vector_prob = 0.7 + self._random_vector_prob = 0.1 + + def build(self, input_shape: tf.TensorShape) -> None: + self.mask_vector = self.add_weight( + shape=(1, 1, input_shape[-1]), name="mask_vector" + ) + self.built = True + + def call( + self, + x: tf.Tensor, + mask: tf.Tensor, + training: Optional[Union[tf.Tensor, bool]] = None, + ) -> Tuple[tf.Tensor, tf.Tensor]: + """Randomly mask input sequences.""" + + if training is None: + training = K.learning_phase() + + lm_mask_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) * mask + lm_mask_bool = tf.greater_equal(lm_mask_prob, self._masking_prob) + + def x_masked() -> tf.Tensor: + x_random_pad = tf.random.uniform( + tf.shape(x), tf.reduce_min(x), tf.reduce_max(x), x.dtype + ) * (1 - mask) + # shuffle over batch dim + x_shuffle = tf.random.shuffle(x * mask + x_random_pad) + + # shuffle over sequence dim + x_shuffle = tf.transpose(x_shuffle, [1, 0, 2]) + x_shuffle = tf.random.shuffle(x_shuffle) + x_shuffle = tf.transpose(x_shuffle, [1, 0, 2]) + + # shuffle doesn't support backprop + x_shuffle = tf.stop_gradient(x_shuffle) + + mask_vector = tf.tile(self.mask_vector, (tf.shape(x)[0], tf.shape(x)[1], 1)) + + other_prob = tf.random.uniform(tf.shape(mask), 0, 1, mask.dtype) + other_prob = tf.tile(other_prob, (1, 1, x.shape[-1])) + x_other = tf.where( + other_prob < self._mask_vector_prob, + mask_vector, + tf.where( + other_prob < self._mask_vector_prob + self._random_vector_prob, + x_shuffle, + x, + ), + ) + + return tf.where(tf.tile(lm_mask_bool, (1, 1, x.shape[-1])), x_other, x) + + return ( + tf_utils.smart_cond(training, x_masked, lambda: tf.identity(x)), + lm_mask_bool, + ) + + +class CRF(tf.keras.layers.Layer): + def __init__( + self, num_tags: int, reg_lambda: float, name: Optional[Text] = None + ) -> None: + super().__init__(name=name) + self.num_tags = num_tags + self.transition_regularizer 
= tf.keras.regularizers.l2(reg_lambda) + + def build(self, input_shape: tf.TensorShape) -> None: + # the weights should be created in `build` to apply random_seed + self.transition_params = self.add_weight( + shape=(self.num_tags, self.num_tags), + regularizer=self.transition_regularizer, + name="transitions", + ) + self.built = True + + def call(self, logits: tf.Tensor, sequence_lengths: tf.Tensor) -> tf.Tensor: + pred_ids, _ = tfa.text.crf.crf_decode( + logits, self.transition_params, sequence_lengths + ) + # set prediction index for padding to `0` + mask = tf.sequence_mask( + sequence_lengths, maxlen=tf.shape(pred_ids)[1], dtype=pred_ids.dtype + ) + + return pred_ids * mask + + def loss( + self, logits: tf.Tensor, tag_indices: tf.Tensor, sequence_lengths: tf.Tensor + ) -> tf.Tensor: + log_likelihood, _ = tfa.text.crf.crf_log_likelihood( + logits, tag_indices, sequence_lengths, self.transition_params + ) + return tf.reduce_mean(-log_likelihood) + + +class DotProductLoss(tf.keras.layers.Layer): + def __init__( + self, + num_neg: int, + loss_type: Text, + mu_pos: float, + mu_neg: float, + use_max_sim_neg: bool, + neg_lambda: float, + scale_loss: bool, + name: Optional[Text] = None, + parallel_iterations: int = 1000, + same_sampling: bool = False, + ) -> None: + super().__init__(name=name) + self.num_neg = num_neg + self.loss_type = loss_type + self.mu_pos = mu_pos + self.mu_neg = mu_neg + self.use_max_sim_neg = use_max_sim_neg + self.neg_lambda = neg_lambda + self.scale_loss = scale_loss + self.parallel_iterations = parallel_iterations + self.same_sampling = same_sampling + + @staticmethod + def _make_flat(x: tf.Tensor) -> tf.Tensor: + """Make tensor 2D.""" + + return tf.reshape(x, (-1, x.shape[-1])) + + def _random_indices( + self, batch_size: tf.Tensor, total_candidates: tf.Tensor + ) -> tf.Tensor: + def rand_idxs() -> tf.Tensor: + """Create random tensor of indices""" + + # (1, num_neg) + return tf.expand_dims( + tf.random.shuffle(tf.range(total_candidates))[: self.num_neg], 0 + ) + + if self.same_sampling: + return tf.tile(rand_idxs(), (batch_size, 1)) + + def cond(idx: tf.Tensor, out: tf.Tensor) -> tf.Tensor: + """Condition for while loop""" + return idx < batch_size + + def body(idx: tf.Tensor, out: tf.Tensor) -> List[tf.Tensor]: + """Body of the while loop""" + return [ + # increment counter + idx + 1, + # add random indices + tf.concat([out, rand_idxs()], 0), + ] + + # first tensor already created + idx1 = tf.constant(1) + # create first random array of indices + out1 = rand_idxs() # (1, num_neg) + + return tf.while_loop( + cond, + body, + loop_vars=[idx1, out1], + shape_invariants=[idx1.shape, tf.TensorShape([None, self.num_neg])], + parallel_iterations=self.parallel_iterations, + back_prop=False, + )[1] + + @staticmethod + def _sample_idxs(batch_size: tf.Tensor, x: tf.Tensor, idxs: tf.Tensor) -> tf.Tensor: + """Sample negative examples for given indices""" + + tiled = tf.tile(tf.expand_dims(x, 0), (batch_size, 1, 1)) + + return tf.gather(tiled, idxs, batch_dims=1) + + def _get_bad_mask( + self, labels: tf.Tensor, target_labels: tf.Tensor, idxs: tf.Tensor + ) -> tf.Tensor: + """Calculate bad mask for given indices. + + Checks that input features are different for positive negative samples. 
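+ A sampled candidate whose label equals the positive label is marked with 1.0 in the returned mask, so it can later be excluded from the negatives (e.g. by adding a large negative value to its similarity).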
+ """ + + pos_labels = tf.expand_dims(target_labels, axis=-2) + neg_labels = self._sample_idxs(tf.shape(target_labels)[0], labels, idxs) + + return tf.cast( + tf.reduce_all(tf.equal(neg_labels, pos_labels), axis=-1), pos_labels.dtype + ) + + def _get_negs( + self, embeds: tf.Tensor, labels: tf.Tensor, target_labels: tf.Tensor + ) -> Tuple[tf.Tensor, tf.Tensor]: + """Get negative examples from given tensor.""" + + embeds_flat = self._make_flat(embeds) + labels_flat = self._make_flat(labels) + target_labels_flat = self._make_flat(target_labels) + + total_candidates = tf.shape(embeds_flat)[0] + target_size = tf.shape(target_labels_flat)[0] + + neg_ids = self._random_indices(target_size, total_candidates) + + neg_embeds = self._sample_idxs(target_size, embeds_flat, neg_ids) + bad_negs = self._get_bad_mask(labels_flat, target_labels_flat, neg_ids) + + # check if inputs have sequence dimension + if len(target_labels.shape) == 3: + # tensors were flattened for sampling, reshape back + # add sequence dimension if it was present in the inputs + target_shape = tf.shape(target_labels) + neg_embeds = tf.reshape( + neg_embeds, (target_shape[0], target_shape[1], -1, embeds.shape[-1]) + ) + bad_negs = tf.reshape(bad_negs, (target_shape[0], target_shape[1], -1)) + + return neg_embeds, bad_negs + + def _sample_negatives( + self, + inputs_embed: tf.Tensor, + labels_embed: tf.Tensor, + labels: tf.Tensor, + all_labels_embed: tf.Tensor, + all_labels: tf.Tensor, + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: + """Sample negative examples.""" + + pos_inputs_embed = tf.expand_dims(inputs_embed, axis=-2) + pos_labels_embed = tf.expand_dims(labels_embed, axis=-2) + + # sample negative inputs + neg_inputs_embed, inputs_bad_negs = self._get_negs(inputs_embed, labels, labels) + # sample negative labels + neg_labels_embed, labels_bad_negs = self._get_negs( + all_labels_embed, all_labels, labels + ) + return ( + pos_inputs_embed, + pos_labels_embed, + neg_inputs_embed, + neg_labels_embed, + inputs_bad_negs, + labels_bad_negs, + ) + + @staticmethod + def sim(a: tf.Tensor, b: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tensor: + """Calculate similarity between given tensors.""" + + sim = tf.reduce_sum(a * b, axis=-1) + if mask is not None: + sim *= tf.expand_dims(mask, 2) + + return sim + + @staticmethod + def confidence_from_sim(sim: tf.Tensor, similarity_type: Text) -> tf.Tensor: + if similarity_type == COSINE: + # clip negative values to zero + return tf.nn.relu(sim) + else: + # normalize result to [0, 1] with softmax + return tf.nn.softmax(sim) + + def _train_sim( + self, + pos_inputs_embed: tf.Tensor, + pos_labels_embed: tf.Tensor, + neg_inputs_embed: tf.Tensor, + neg_labels_embed: tf.Tensor, + inputs_bad_negs: tf.Tensor, + labels_bad_negs: tf.Tensor, + mask: Optional[tf.Tensor], + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: + """Define similarity.""" + + # calculate similarity with several + # embedded actions for the loss + neg_inf = tf.constant(-1e9) + + sim_pos = self.sim(pos_inputs_embed, pos_labels_embed, mask) + sim_neg_il = ( + self.sim(pos_inputs_embed, neg_labels_embed, mask) + + neg_inf * labels_bad_negs + ) + sim_neg_ll = ( + self.sim(pos_labels_embed, neg_labels_embed, mask) + + neg_inf * labels_bad_negs + ) + sim_neg_ii = ( + self.sim(pos_inputs_embed, neg_inputs_embed, mask) + + neg_inf * inputs_bad_negs + ) + sim_neg_li = ( + self.sim(pos_labels_embed, neg_inputs_embed, mask) + + neg_inf * inputs_bad_negs + ) + + # output similarities 
between user input and bot actions + # and similarities between bot actions and similarities between user inputs + return sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li + + @staticmethod + def _calc_accuracy(sim_pos: tf.Tensor, sim_neg: tf.Tensor) -> tf.Tensor: + """Calculate accuracy.""" + + max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], axis=-1), axis=-1) + return tf.reduce_mean( + tf.cast( + tf.math.equal(max_all_sim, tf.squeeze(sim_pos, axis=-1)), tf.float32 + ) + ) + + def _loss_margin( + self, + sim_pos: tf.Tensor, + sim_neg_il: tf.Tensor, + sim_neg_ll: tf.Tensor, + sim_neg_ii: tf.Tensor, + sim_neg_li: tf.Tensor, + mask: Optional[tf.Tensor], + ) -> tf.Tensor: + """Define max margin loss.""" + + # loss for maximizing similarity with correct action + loss = tf.maximum(0.0, self.mu_pos - tf.squeeze(sim_pos, axis=-1)) + + # loss for minimizing similarity with `num_neg` incorrect actions + if self.use_max_sim_neg: + # minimize only maximum similarity over incorrect actions + max_sim_neg_il = tf.reduce_max(sim_neg_il, axis=-1) + loss += tf.maximum(0.0, self.mu_neg + max_sim_neg_il) + else: + # minimize all similarities with incorrect actions + max_margin = tf.maximum(0.0, self.mu_neg + sim_neg_il) + loss += tf.reduce_sum(max_margin, axis=-1) + + # penalize max similarity between pos bot and neg bot embeddings + max_sim_neg_ll = tf.maximum( + 0.0, self.mu_neg + tf.reduce_max(sim_neg_ll, axis=-1) + ) + loss += max_sim_neg_ll * self.neg_lambda + + # penalize max similarity between pos dial and neg dial embeddings + max_sim_neg_ii = tf.maximum( + 0.0, self.mu_neg + tf.reduce_max(sim_neg_ii, axis=-1) + ) + loss += max_sim_neg_ii * self.neg_lambda + + # penalize max similarity between pos bot and neg dial embeddings + max_sim_neg_li = tf.maximum( + 0.0, self.mu_neg + tf.reduce_max(sim_neg_li, axis=-1) + ) + loss += max_sim_neg_li * self.neg_lambda + + if mask is not None: + # mask loss for different length sequences + loss *= mask + # average the loss over sequence length + loss = tf.reduce_sum(loss, axis=-1) / tf.reduce_sum(mask, axis=1) + + # average the loss over the batch + loss = tf.reduce_mean(loss) + + return loss + + def _loss_softmax( + self, + sim_pos: tf.Tensor, + sim_neg_il: tf.Tensor, + sim_neg_ll: tf.Tensor, + sim_neg_ii: tf.Tensor, + sim_neg_li: tf.Tensor, + mask: Optional[tf.Tensor], + ) -> tf.Tensor: + """Define softmax loss.""" + + logits = tf.concat( + [sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li], axis=-1 + ) + + # create label_ids for softmax + label_ids = tf.zeros_like(logits[..., 0], tf.int32) + + loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=label_ids, logits=logits + ) + + if mask is None: + mask = 1.0 + + if self.scale_loss: + # mask loss by prediction confidence + pos_pred = tf.stop_gradient(tf.nn.softmax(logits)[..., 0]) + # the scaling parameters are found empirically + scale_mask = mask * tf.pow(tf.minimum(0.5, 1 - pos_pred) / 0.5, 4) + # scale loss + loss *= scale_mask + + if len(loss.shape) == 2: + # average over the sequence + loss = tf.reduce_sum(loss, axis=-1) / tf.reduce_sum(mask, axis=-1) + + # average the loss over all examples + loss = tf.reduce_mean(loss) + + return loss + + @property + def _chosen_loss(self) -> Callable: + """Use loss depending on given option.""" + + if self.loss_type == MARGIN: + return self._loss_margin + elif self.loss_type == SOFTMAX: + return self._loss_softmax + else: + raise ValueError( + f"Wrong loss type '{self.loss_type}', " + f"should be '{MARGIN}' or '{SOFTMAX}'" + ) + + def 
call( + self, + inputs_embed: tf.Tensor, + labels_embed: tf.Tensor, + labels: tf.Tensor, + all_labels_embed: tf.Tensor, + all_labels: tf.Tensor, + mask: Optional[tf.Tensor] = None, + ) -> Tuple[tf.Tensor, tf.Tensor]: + """Calculate loss and accuracy.""" + + ( + pos_inputs_embed, + pos_labels_embed, + neg_inputs_embed, + neg_labels_embed, + inputs_bad_negs, + labels_bad_negs, + ) = self._sample_negatives( + inputs_embed, labels_embed, labels, all_labels_embed, all_labels + ) + + # calculate similarities + sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li = self._train_sim( + pos_inputs_embed, + pos_labels_embed, + neg_inputs_embed, + neg_labels_embed, + inputs_bad_negs, + labels_bad_negs, + mask, + ) + + acc = self._calc_accuracy(sim_pos, sim_neg_il) + + loss = self._chosen_loss( + sim_pos, sim_neg_il, sim_neg_ll, sim_neg_ii, sim_neg_li, mask + ) + + return loss, acc diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py new file mode 100644 index 000000000000..0fbdba4e9d86 --- /dev/null +++ b/rasa/utils/tensorflow/model_data.py @@ -0,0 +1,591 @@ +import logging + +import numpy as np +import scipy.sparse +import tensorflow as tf + +from sklearn.model_selection import train_test_split +from typing import Optional, Dict, Text, List, Tuple, Any, Union, Generator, NamedTuple +from collections import defaultdict +from rasa.utils.tensorflow.constants import BALANCED, SEQUENCE + +logger = logging.getLogger(__name__) + + +# Mapping of feature name to a list of numpy arrays representing the actual features +# For example: +# "text_features" -> [ +# "numpy array containing dense features for every training example", +# "numpy array containing sparse features for every training example" +# ] +Data = Dict[Text, List[np.ndarray]] + + +class FeatureSignature(NamedTuple): + """Stores the shape and the type (sparse vs dense) of features.""" + + is_sparse: bool + shape: List[int] + + +class RasaModelData: + """Data object used for all RasaModels. + + It contains all features needed to train the models. + """ + + def __init__( + self, label_key: Optional[Text] = None, data: Optional[Data] = None + ) -> None: + """ + Initializes the RasaModelData object. + + Args: + label_key: the label_key used for balancing, etc. + data: the data holding the features + """ + + self.data = data or {} + self.label_key = label_key + # should be updated when features are added + self.num_examples = self.number_of_examples() + + def get_only(self, key: Text) -> Optional[np.ndarray]: + if key in self.data: + return self.data[key][0] + else: + return None + + def get(self, key: Text) -> List[np.ndarray]: + if key in self.data: + return self.data[key] + else: + return [] + + def items(self): + return self.data.items() + + def values(self): + return self.data.values() + + def keys(self): + return self.data.keys() + + def first_data_example(self) -> Data: + return { + feature_name: [feature[:1] for feature in features] + for feature_name, features in self.data.items() + } + + def feature_not_exist(self, key: Text) -> bool: + """Check if feature key is present and features are available.""" + + return key not in self.data or not self.data[key] + + def is_empty(self) -> bool: + """Checks if data is set.""" + + return not self.data + + def number_of_examples(self, data: Optional[Data] = None) -> int: + """Obtain number of examples in data. + + Raises: A ValueError if number of examples differ for different features. 
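+ Returns: The number of examples, which is the same for every feature in the data.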
+ """ + + if not data: + data = self.data + + if not data: + return 0 + + example_lengths = [v.shape[0] for values in data.values() for v in values] + + # check if number of examples is the same for all values + if not all(length == example_lengths[0] for length in example_lengths): + raise ValueError( + f"Number of examples differs for keys '{data.keys()}'. Number of " + f"examples should be the same for all data." + ) + + return example_lengths[0] + + def feature_dimension(self, key: Text) -> int: + """Get the feature dimension of the given key.""" + + number_of_features = 0 + for data in self.data[key]: + if data.size > 0: + number_of_features += data[0].shape[-1] + + return number_of_features + + def add_features(self, key: Text, features: List[np.ndarray]): + """Add list of features to data under specified key. + + Should update number of examples. + """ + + if not features: + return + + if key in self.data: + raise ValueError(f"Key '{key}' already exists in RasaModelData.") + + self.data[key] = [] + + for data in features: + if data.size > 0: + self.data[key].append(data) + + if not self.data[key]: + del self.data[key] + + # update number of examples + self.num_examples = self.number_of_examples() + + def add_mask(self, key: Text, from_key: Text): + """Calculate mask for given key and put it under specified key.""" + + if not self.data.get(from_key): + return + + self.data[key] = [] + + for data in self.data[from_key]: + if data.size > 0: + # explicitly add last dimension to mask + # to track correctly dynamic sequences + mask = np.array([np.ones((x.shape[0], 1)) for x in data]) + self.data[key].append(mask) + break + + def split( + self, number_of_test_examples: int, random_seed: int + ) -> Tuple["RasaModelData", "RasaModelData"]: + """Create random hold out test set using stratified split.""" + + self._check_label_key() + + if self.label_key is None: + # randomly split data as no label key is split + multi_values = [v for values in self.data.values() for v in values] + solo_values = [[] for values in self.data.values() for v in values] + stratify = None + else: + # make sure that examples for each label value are in both split sets + label_ids = self._create_label_ids(self.data[self.label_key][0]) + label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0))) + + self._check_train_test_sizes(number_of_test_examples, label_counts) + + counts = np.array([label_counts[label] for label in label_ids]) + # we perform stratified train test split, + # which insures every label is present in the train and test data + # this operation can be performed only for labels + # that contain several data points + multi_values = [ + v[counts > 1] for values in self.data.values() for v in values + ] + # collect data points that are unique for their label + solo_values = [ + v[counts == 1] for values in self.data.values() for v in values + ] + + stratify = label_ids[counts > 1] + + output_values = train_test_split( + *multi_values, + test_size=number_of_test_examples, + random_state=random_seed, + stratify=stratify, + ) + + return self._convert_train_test_split(output_values, solo_values) + + def get_signature(self) -> Dict[Text, List[FeatureSignature]]: + """Get signature of RasaModelData. + + Signature stores the shape and whether features are sparse or not for every key. 
+ """ + + return { + key: [ + FeatureSignature( + True if isinstance(v[0], scipy.sparse.spmatrix) else False, + v[0].shape, + ) + for v in values + ] + for key, values in self.data.items() + } + + def as_tf_dataset( + self, batch_size: int, batch_strategy: Text = SEQUENCE, shuffle: bool = False + ) -> tf.data.Dataset: + """Create tf dataset.""" + + shapes, types = self._get_shapes_types() + + return tf.data.Dataset.from_generator( + lambda batch_size_: self._gen_batch(batch_size_, batch_strategy, shuffle), + output_types=types, + output_shapes=shapes, + args=([batch_size]), + ) + + def prepare_batch( + self, + data: Optional[Data] = None, + start: Optional[int] = None, + end: Optional[int] = None, + tuple_sizes: Optional[Dict[Text, int]] = None, + ) -> Tuple[Optional[np.ndarray]]: + """Slices model data into batch using given start and end value.""" + + if not data: + data = self.data + + batch_data = [] + + for key, values in data.items(): + # add None for not present values during processing + if not values: + if tuple_sizes: + batch_data += [None] * tuple_sizes[key] + else: + batch_data.append(None) + continue + + for v in values: + if start is not None and end is not None: + _data = v[start:end] + elif start is not None: + _data = v[start:] + elif end is not None: + _data = v[:end] + else: + _data = v[:] + + if isinstance(_data[0], scipy.sparse.spmatrix): + batch_data.extend(self._scipy_matrix_to_values(_data)) + else: + batch_data.append(self._pad_dense_data(_data)) + + # len of batch_data is equal to the number of keys in model data + return tuple(batch_data) + + def _get_shapes_types(self) -> Tuple: + """Extract shapes and types from model data.""" + + types = [] + shapes = [] + + def append_shape(features: np.ndarray) -> None: + if isinstance(features[0], scipy.sparse.spmatrix): + # scipy matrix is converted into indices, data, shape + shapes.append((None, features[0].ndim + 1)) + shapes.append((None,)) + shapes.append((features[0].ndim + 1)) + elif features[0].ndim == 0: + shapes.append((None,)) + elif features[0].ndim == 1: + shapes.append((None, features[0].shape[-1])) + else: + shapes.append((None, None, features[0].shape[-1])) + + def append_type(features: np.ndarray) -> None: + if isinstance(features[0], scipy.sparse.spmatrix): + # scipy matrix is converted into indices, data, shape + types.append(tf.int64) + types.append(tf.float32) + types.append(tf.int64) + else: + types.append(tf.float32) + + for values in self.data.values(): + for v in values: + append_shape(v) + append_type(v) + + return tuple(shapes), tuple(types) + + def _shuffled_data(self, data: Data) -> Data: + """Shuffle model data.""" + + ids = np.random.permutation(self.num_examples) + return self._data_for_ids(data, ids) + + def _balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data: + """Mix model data to account for class imbalance. + + This batching strategy puts rare classes approximately in every other batch, + by repeating them. Mimics stratified batching, but also takes into account + that more populated classes should appear more often. 
+ """ + + self._check_label_key() + + # skip balancing if labels are token based + if self.label_key is None or data[self.label_key][0][0].size > 1: + return data + + label_ids = self._create_label_ids(data[self.label_key][0]) + + unique_label_ids, counts_label_ids = np.unique( + label_ids, return_counts=True, axis=0 + ) + num_label_ids = len(unique_label_ids) + + # group data points by their label + # need to call every time, so that the data is shuffled inside each class + data_by_label = self._split_by_label_ids(data, label_ids, unique_label_ids) + + # running index inside each data grouped by labels + data_idx = [0] * num_label_ids + # number of cycles each label was passed + num_data_cycles = [0] * num_label_ids + # if a label was skipped in current batch + skipped = [False] * num_label_ids + + new_data = defaultdict(list) + + while min(num_data_cycles) == 0: + if shuffle: + indices_of_labels = np.random.permutation(num_label_ids) + else: + indices_of_labels = range(num_label_ids) + + for index in indices_of_labels: + if num_data_cycles[index] > 0 and not skipped[index]: + skipped[index] = True + continue + else: + skipped[index] = False + + index_batch_size = ( + int(counts_label_ids[index] / self.num_examples * batch_size) + 1 + ) + + for k, values in data_by_label[index].items(): + for i, v in enumerate(values): + if len(new_data[k]) < i + 1: + new_data[k].append([]) + new_data[k][i].append( + v[data_idx[index] : data_idx[index] + index_batch_size] + ) + + data_idx[index] += index_batch_size + if data_idx[index] >= counts_label_ids[index]: + num_data_cycles[index] += 1 + data_idx[index] = 0 + + if min(num_data_cycles) > 0: + break + + final_data = defaultdict(list) + for k, values in new_data.items(): + for v in values: + final_data[k].append(np.concatenate(np.array(v))) + + return final_data + + def _gen_batch( + self, batch_size: int, batch_strategy: Text = SEQUENCE, shuffle: bool = False + ) -> Generator[Tuple[Optional[np.ndarray]], None, None]: + """Generate batches.""" + + data = self.data + num_examples = self.num_examples + + if shuffle: + data = self._shuffled_data(data) + + if batch_strategy == BALANCED: + data = self._balanced_data(data, batch_size, shuffle) + # after balancing, number of examples increased + num_examples = self.number_of_examples(data) + + num_batches = num_examples // batch_size + int(num_examples % batch_size > 0) + + for batch_num in range(num_batches): + start = batch_num * batch_size + end = start + batch_size + + yield self.prepare_batch(data, start, end) + + def _check_train_test_sizes( + self, number_of_test_examples: int, label_counts: Dict[Any, int] + ): + """Check whether the test data set is too large or too small.""" + + if number_of_test_examples >= self.num_examples - len(label_counts): + raise ValueError( + f"Test set of {number_of_test_examples} is too large. Remaining " + f"train set should be at least equal to number of classes " + f"{len(label_counts)}." + ) + elif number_of_test_examples < len(label_counts): + raise ValueError( + f"Test set of {number_of_test_examples} is too small. It should " + f"be at least equal to number of classes {label_counts}." 
+ ) + + @staticmethod + def _data_for_ids(data: Optional[Data], ids: np.ndarray) -> Data: + """Filter model data by ids.""" + + new_data = defaultdict(list) + + if data is None: + return new_data + + for k, values in data.items(): + for v in values: + new_data[k].append(v[ids]) + return new_data + + def _split_by_label_ids( + self, data: Optional[Data], label_ids: np.ndarray, unique_label_ids: np.ndarray + ) -> List["RasaModelData"]: + """Reorganize model data into a list of model data with the same labels.""" + + label_data = [] + for label_id in unique_label_ids: + matching_ids = label_ids == label_id + label_data.append( + RasaModelData(self.label_key, self._data_for_ids(data, matching_ids)) + ) + return label_data + + def _check_label_key(self): + if self.label_key is not None and ( + self.label_key not in self.data or len(self.data[self.label_key]) > 1 + ): + raise ValueError(f"Key '{self.label_key}' not in RasaModelData.") + + def _convert_train_test_split( + self, output_values: List[Any], solo_values: List[Any] + ) -> Tuple["RasaModelData", "RasaModelData"]: + """Converts the output of sklearn's train_test_split into model data.""" + + data_train = defaultdict(list) + data_val = defaultdict(list) + + # output_values = x_train, x_val, y_train, y_val, z_train, z_val, etc. + # order is kept, e.g. same order as model data keys + + # train datasets have an even index + index = 0 + for key, values in self.data.items(): + for _ in values: + data_train[key].append( + self._combine_features(output_values[index * 2], solo_values[index]) + ) + index += 1 + + # val datasets have an odd index + index = 0 + for key, values in self.data.items(): + for _ in range(len(values)): + data_val[key].append(output_values[(index * 2) + 1]) + index += 1 + + return ( + RasaModelData(self.label_key, data_train), + RasaModelData(self.label_key, data_val), + ) + + @staticmethod + def _combine_features( + feature_1: Union[np.ndarray, scipy.sparse.spmatrix], + feature_2: Union[np.ndarray, scipy.sparse.spmatrix], + ) -> Union[np.ndarray, scipy.sparse.spmatrix]: + """Concatenate features.""" + + if isinstance(feature_1, scipy.sparse.spmatrix) and isinstance( + feature_2, scipy.sparse.spmatrix + ): + if feature_2.shape[0] == 0: + return feature_1 + if feature_1.shape[0] == 0: + return feature_2 + return scipy.sparse.vstack([feature_1, feature_2]) + + return np.concatenate([feature_1, feature_2]) + + @staticmethod + def _create_label_ids(label_ids: np.ndarray) -> np.ndarray: + """Convert various size label_ids into single dim array. + + For multi-label y, map each distinct row to a string representation + using join because str(row) uses an ellipsis if len(row) > 1000. + Idea taken from sklearn's stratify split. + """ + + if label_ids.ndim == 1: + return label_ids + + if label_ids.ndim == 2 and label_ids.shape[-1] == 1: + return label_ids[:, 0] + + if label_ids.ndim == 2: + return np.array([" ".join(row.astype("str")) for row in label_ids]) + + if label_ids.ndim == 3 and label_ids.shape[-1] == 1: + return np.array([" ".join(row.astype("str")) for row in label_ids[:, :, 0]]) + + raise ValueError("Unsupported label_ids dimensions") + + @staticmethod + def _pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray: + """Pad data of different lengths. + + Sequential data is padded with zeros. Zeros are added to the end of data. 
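+ For example, two sequences of lengths 3 and 5 with feature dimension 10 are returned as a single float32 array of shape (2, 5, 10).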
+ """ + + if array_of_dense[0].ndim < 2: + # data doesn't contain a sequence + return array_of_dense.astype(np.float32) + + data_size = len(array_of_dense) + max_seq_len = max([x.shape[0] for x in array_of_dense]) + + data_padded = np.zeros( + [data_size, max_seq_len, array_of_dense[0].shape[-1]], + dtype=array_of_dense[0].dtype, + ) + for i in range(data_size): + data_padded[i, : array_of_dense[i].shape[0], :] = array_of_dense[i] + + return data_padded.astype(np.float32) + + @staticmethod + def _scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: + """Convert a scipy matrix into indices, data, and shape.""" + + # we need to make sure that the matrices are coo_matrices otherwise the + # transformation does not work (e.g. you cannot access x.row, x.col) + if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix): + array_of_sparse = [x.tocoo() for x in array_of_sparse] + + max_seq_len = max([x.shape[0] for x in array_of_sparse]) + + # get the indices of values + indices = np.hstack( + [ + np.vstack([i * np.ones_like(x.row), x.row, x.col]) + for i, x in enumerate(array_of_sparse) + ] + ).T + + data = np.hstack([x.data for x in array_of_sparse]) + + number_of_features = array_of_sparse[0].shape[-1] + shape = np.array((len(array_of_sparse), max_seq_len, number_of_features)) + + return [ + indices.astype(np.int64), + data.astype(np.float32), + shape.astype(np.int64), + ] diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py new file mode 100644 index 000000000000..48dafd731f21 --- /dev/null +++ b/rasa/utils/tensorflow/models.py @@ -0,0 +1,378 @@ +import tensorflow as tf +import numpy as np +import logging +from collections import defaultdict +from typing import List, Text, Dict, Tuple, Union, Optional, Callable +from tqdm import tqdm +from rasa.utils.common import is_logging_disabled +from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature +from rasa.utils.tensorflow.constants import SEQUENCE + +logger = logging.getLogger(__name__) + + +# noinspection PyMethodOverriding +class RasaModel(tf.keras.models.Model): + """Completely override all public methods of keras Model. + + Cannot be used as tf.keras.Model + """ + + def __init__(self, random_seed: Optional[int] = None, **kwargs) -> None: + """Initialize the RasaModel. + + Args: + random_seed: set the random seed to get reproducible results + """ + super().__init__(**kwargs) + + self.total_loss = tf.keras.metrics.Mean(name="t_loss") + self.metrics_to_log = ["t_loss"] + + self._training = None # training phase should be defined when building a graph + + self._predict_function = None + + self.random_seed = random_seed + + def batch_loss( + self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] + ) -> tf.Tensor: + raise NotImplementedError + + def batch_predict( + self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] + ) -> Dict[Text, tf.Tensor]: + raise NotImplementedError + + def fit( + self, + model_data: RasaModelData, + epochs: int, + batch_size: Union[List[int], int], + evaluate_on_num_examples: int, + evaluate_every_num_epochs: int, + batch_strategy: Text, + silent: bool = False, + eager: bool = False, + ) -> None: + """Fit model data""" + + tf.random.set_seed(self.random_seed) + np.random.seed(self.random_seed) + + disable = silent or is_logging_disabled() + + evaluation_model_data = None + if evaluate_on_num_examples > 0: + if not disable: + logger.info( + f"Validation accuracy is calculated every " + f"{evaluate_every_num_epochs} epochs." 
+ ) + + model_data, evaluation_model_data = model_data.split( + evaluate_on_num_examples, self.random_seed + ) + + ( + train_dataset_function, + tf_train_on_batch_function, + ) = self._get_tf_train_functions(eager, model_data, batch_strategy) + ( + evaluation_dataset_function, + tf_evaluation_on_batch_function, + ) = self._get_tf_evaluation_functions(eager, evaluation_model_data) + + val_results = {} # validation is not performed every epoch + progress_bar = tqdm(range(epochs), desc="Epochs", disable=disable) + + for epoch in progress_bar: + epoch_batch_size = self.linearly_increasing_batch_size( + epoch, batch_size, epochs + ) + + self._batch_loop( + train_dataset_function, + tf_train_on_batch_function, + epoch_batch_size, + True, + ) + + postfix_dict = self._get_metric_results() + + if evaluate_on_num_examples > 0: + if self._should_evaluate(evaluate_every_num_epochs, epochs, epoch): + self._batch_loop( + evaluation_dataset_function, + tf_evaluation_on_batch_function, + epoch_batch_size, + False, + ) + val_results = self._get_metric_results(prefix="val_") + + postfix_dict.update(val_results) + + progress_bar.set_postfix(postfix_dict) + + self._training = None # training phase should be defined when building a graph + if not disable: + logger.info("Finished training.") + + def train_on_batch( + self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] + ) -> None: + """Train on batch""" + + with tf.GradientTape() as tape: + total_loss = self._total_batch_loss(batch_in) + + gradients = tape.gradient(total_loss, self.trainable_variables) + self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) + + def build_for_predict( + self, predict_data: RasaModelData, eager: bool = False + ) -> None: + self._training = False # needed for tf graph mode + self._predict_function = self._get_tf_call_model_function( + predict_data.as_tf_dataset, self.batch_predict, eager, "prediction" + ) + + def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]: + if self._predict_function is None: + logger.debug("There is no tensorflow prediction graph.") + self.build_for_predict(predict_data) + + predict_dataset = predict_data.as_tf_dataset(batch_size=1) + batch_in = next(iter(predict_dataset)) + + self._training = False # needed for eager mode + return self._predict_function(batch_in) + + def save(self, model_file_name: Text) -> None: + self.save_weights(model_file_name, save_format="tf") + + @classmethod + def load( + cls, model_file_name: Text, model_data_example: RasaModelData, *args, **kwargs + ) -> "RasaModel": + logger.debug("Loading the model ...") + # create empty model + model = cls(*args, **kwargs) + # need to train on 1 example to build weights of the correct size + model.fit( + model_data_example, + epochs=1, + batch_size=1, + evaluate_every_num_epochs=0, + evaluate_on_num_examples=0, + batch_strategy=SEQUENCE, + silent=True, # don't confuse users with training output + eager=True, # no need to build tf graph, eager is faster here + ) + # load trained weights + model.load_weights(model_file_name) + + logger.debug("Finished loading the model.") + return model + + def _total_batch_loss( + self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] + ) -> tf.Tensor: + """Calculate total loss""" + + prediction_loss = self.batch_loss(batch_in) + regularization_loss = tf.math.add_n(self.losses) + total_loss = prediction_loss + regularization_loss + self.total_loss.update_state(total_loss) + + return total_loss + + def _batch_loop( + self, + dataset_function: Callable, + 
call_model_function: Callable, + batch_size: int, + training: bool, + ) -> None: + """Run on batches""" + + self.reset_metrics() + self._training = training # needed for eager mode + for batch_in in dataset_function(batch_size): + call_model_function(batch_in) + + @staticmethod + def _get_tf_call_model_function( + dataset_function: Callable, + call_model_function: Callable, + eager: bool, + phase: Text, + ) -> Callable: + """Convert functions to tensorflow functions""" + + if eager: + return call_model_function + + logger.debug(f"Building tensorflow {phase} graph...") + + init_dataset = dataset_function(1) + tf_call_model_function = tf.function( + call_model_function, input_signature=[init_dataset.element_spec] + ) + tf_call_model_function(next(iter(init_dataset))) + + logger.debug(f"Finished building tensorflow {phase} graph.") + + return tf_call_model_function + + def _get_tf_train_functions( + self, eager: bool, model_data: RasaModelData, batch_strategy: Text + ) -> Tuple[Callable, Callable]: + """Create train tensorflow functions""" + + def train_dataset_function(_batch_size: int) -> tf.data.Dataset: + return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True) + + self._training = True # needed for tf graph mode + return ( + train_dataset_function, + self._get_tf_call_model_function( + train_dataset_function, self.train_on_batch, eager, "train" + ), + ) + + def _get_tf_evaluation_functions( + self, eager: bool, evaluation_model_data: Optional[RasaModelData] + ) -> Tuple[Optional[Callable], Optional[Callable]]: + """Create evaluation tensorflow functions""" + + if evaluation_model_data is None: + return None, None + + def evaluation_dataset_function(_batch_size: int) -> tf.data.Dataset: + return evaluation_model_data.as_tf_dataset( + _batch_size, SEQUENCE, shuffle=False + ) + + self._training = False # needed for tf graph mode + return ( + evaluation_dataset_function, + self._get_tf_call_model_function( + evaluation_dataset_function, self._total_batch_loss, eager, "evaluation" + ), + ) + + def _get_metric_results(self, prefix: Optional[Text] = None) -> Dict[Text, Text]: + """Get the metrics results""" + + prefix = prefix or "" + + return { + f"{prefix}{metric.name}": f"{metric.result().numpy():.3f}" + for metric in self.metrics + if metric.name in self.metrics_to_log + } + + @staticmethod + def _should_evaluate( + evaluate_every_num_epochs: int, epochs: int, current_epoch: int + ) -> bool: + return ( + current_epoch == 0 + or (current_epoch + 1) % evaluate_every_num_epochs == 0 + or (current_epoch + 1) == epochs + ) + + @staticmethod + def batch_to_model_data_format( + batch: Union[Tuple[tf.Tensor], Tuple[np.ndarray]], + data_signature: Dict[Text, List[FeatureSignature]], + ) -> Dict[Text, List[tf.Tensor]]: + """Convert input batch tensors into batch data format. + + Batch contains any number of batch data. The order is equal to the + key-value pairs in session data. As sparse data were converted into indices, data, + shape before, this methods converts them into sparse tensors. Dense data is + kept. 
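+ In other words, each sparse feature is consumed as three consecutive batch tensors (indices, values, shape) and rebuilt into a single tf.SparseTensor, while each dense feature is consumed as one tensor.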
+ """ + + batch_data = defaultdict(list) + + idx = 0 + for k, signature in data_signature.items(): + for is_sparse, shape in signature: + if is_sparse: + # explicitly substitute last dimension in shape with known + # static value + batch_data[k].append( + tf.SparseTensor( + batch[idx], + batch[idx + 1], + [batch[idx + 2][0], batch[idx + 2][1], shape[-1]], + ) + ) + idx += 3 + else: + if isinstance(batch[idx], tf.Tensor): + batch_data[k].append(batch[idx]) + else: + # convert to Tensor + batch_data[k].append(tf.constant(batch[idx], dtype=tf.float32)) + idx += 1 + + return batch_data + + @staticmethod + def linearly_increasing_batch_size( + epoch: int, batch_size: Union[List[int], int], epochs: int + ) -> int: + """Linearly increase batch size with every epoch. + + The idea comes from https://arxiv.org/abs/1711.00489. + """ + + if not isinstance(batch_size, list): + return int(batch_size) + + if epochs > 1: + return int( + batch_size[0] + epoch * (batch_size[1] - batch_size[0]) / (epochs - 1) + ) + else: + return int(batch_size[0]) + + def compile(self, *args, **kwargs) -> None: + raise Exception( + "This method should neither be called nor implemented in our code." + ) + + def evaluate(self, *args, **kwargs) -> None: + raise Exception( + "This method should neither be called nor implemented in our code." + ) + + def test_on_batch(self, *args, **kwargs) -> None: + raise Exception( + "This method should neither be called nor implemented in our code." + ) + + def predict_on_batch(self, *args, **kwargs) -> None: + raise Exception( + "This method should neither be called nor implemented in our code." + ) + + def fit_generator(self, *args, **kwargs) -> None: + raise Exception( + "This method should neither be called nor implemented in our code." + ) + + def evaluate_generator(self, *args, **kwargs) -> None: + raise Exception( + "This method should neither be called nor implemented in our code." + ) + + def predict_generator(self, *args, **kwargs) -> None: + raise Exception( + "This method should neither be called nor implemented in our code." + ) diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py new file mode 100644 index 000000000000..c7a309db1c38 --- /dev/null +++ b/rasa/utils/tensorflow/transformer.py @@ -0,0 +1,520 @@ +from typing import List, Optional, Text, Tuple, Union +import tensorflow as tf +import tensorflow_addons as tfa +from tensorflow.python.keras.utils import tf_utils +from tensorflow.python.keras import backend as K +import numpy as np +from rasa.utils.tensorflow.layers import DenseWithSparseWeights + + +# from https://www.tensorflow.org/tutorials/text/transformer +# and https://github.com/tensorflow/tensor2tensor +class MultiHeadAttention(tf.keras.layers.Layer): + def __init__( + self, + units: int, + num_heads: int, + attention_dropout_rate: float = 0.0, + sparsity: float = 0.8, + unidirectional: bool = False, + use_key_relative_position: bool = False, + use_value_relative_position: bool = False, + max_relative_position: Optional[int] = None, + heads_share_relative_embedding: bool = False, + ) -> None: + super().__init__() + + if units % num_heads != 0: + raise ValueError( + f"number of units {units} should be proportional to " + f"number of attention heads {num_heads}." 
+ ) + + self.num_heads = num_heads + self.units = units + self.attention_dropout_rate = attention_dropout_rate + self.unidirectional = unidirectional + self.use_key_relative_position = use_key_relative_position + self.use_value_relative_position = use_value_relative_position + self.relative_length = max_relative_position + if self.relative_length is not None: + self.relative_length += 1 # include current time + self.heads_share_relative_embedding = heads_share_relative_embedding + + self._depth = units // self.num_heads + + # process queries + self._wq = DenseWithSparseWeights( + units=units, use_bias=False, sparsity=sparsity + ) + # process keys + self._wk = DenseWithSparseWeights( + units=units, use_bias=False, sparsity=sparsity + ) + # process values + self._wv = DenseWithSparseWeights( + units=units, use_bias=False, sparsity=sparsity + ) + # process attention output + self._dense = DenseWithSparseWeights(units=units, sparsity=sparsity) + + self._create_relative_embeddings() + + def _create_relative_embeddings(self) -> None: + """Create relative embeddings.""" + + relative_embedding_shape = None + self.key_relative_embeddings = None + self.value_relative_embeddings = None + + if self.use_key_relative_position or self.use_value_relative_position: + if not self.relative_length: + raise ValueError( + f"Max relative position {self.relative_length} " + f"should be > 0 when using relative attention." + ) + + if self.unidirectional: + relative_length = self.relative_length + else: + relative_length = 2 * self.relative_length - 1 + + if self.heads_share_relative_embedding: + relative_embedding_shape = (relative_length, self._depth) + else: + relative_embedding_shape = ( + self.num_heads, + relative_length, + self._depth, + ) + + if self.use_key_relative_position: + self.key_relative_embeddings = self.add_weight( + shape=relative_embedding_shape, name="key_relative_embeddings" + ) + + if self.use_value_relative_position: + self.value_relative_embeddings = self.add_weight( + shape=relative_embedding_shape, name="value_relative_embeddings" + ) + + def _pad_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor: + # pad the left side to length + pad_left = x[:, :, :, :1, :] + pad_left = tf.tile(pad_left, (1, 1, 1, length - self.relative_length, 1)) + + # pad the right side to length + if self.unidirectional: + right_relative_length = 1 # current time + pad_right = tf.zeros_like(x[:, :, :, -1:, :]) + else: + right_relative_length = self.relative_length + pad_right = x[:, :, :, -1:, :] + pad_right = tf.tile(pad_right, (1, 1, 1, length - right_relative_length, 1)) + + return tf.concat([pad_left, x, pad_right], axis=-2) + + def _slice_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor: + if self.unidirectional: + # pad the right side to relative_length + pad_right = tf.zeros_like(x[:, :, :, -1:, :]) + pad_right = tf.tile(pad_right, (1, 1, 1, self.relative_length - 1, 1)) + x = tf.concat([x, pad_right], axis=-2) + + extra_length = self.relative_length - length + full_length = tf.shape(x)[-2] + return x[:, :, :, extra_length : full_length - extra_length, :] + + def _relative_to_absolute_position(self, x: tf.Tensor) -> tf.Tensor: + """Universal method to convert tensor from relative to absolute indexing. + + x.shape = + (batch, num_heads, length, relative_length, depth) + or (batch, num_heads, length, relative_length) + "Slides" relative embeddings by 45 degree. 
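+ The returned tensor is indexed by absolute positions, i.e. it has shape (batch, num_heads, length, length, depth), or (batch, num_heads, length, length) if the input had no depth dimension.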
+ """ + + x_dim = len(x.shape) + + if x_dim < 4 or x_dim > 5: + raise ValueError( + f"Relative tensor has a wrong shape {x.shape}, " + f"it should have 4 or 5 dimensions." + ) + if x_dim == 4: + # add fake depth dimension + x = tf.expand_dims(x, axis=-1) + + batch = tf.shape(x)[0] + num_heads = tf.shape(x)[1] + length = tf.shape(x)[2] + depth = tf.shape(x)[-1] + + x = tf.cond( + length > self.relative_length, + lambda: self._pad_relative_embeddings(x, length), + lambda: self._slice_relative_embeddings(x, length), + ) + + # add a column of zeros to "slide" columns to diagonals through reshape + pad_shift = tf.zeros_like(x[:, :, :, -1:, :]) + x = tf.concat([x, pad_shift], axis=-2) + + # flatten length dimensions + x = tf.reshape(x, (batch, num_heads, -1, depth)) + width = 2 * length + + # add zeros so that the result of back reshape is still a matrix + pad_flat = tf.zeros_like( + x[:, :, : (width - 1) - width * length % (width - 1), :] + ) + x = tf.concat([x, pad_flat], axis=-2) + + # "slide" columns to diagonals through reshape + x = tf.reshape(x, (batch, num_heads, -1, width - 1, depth)) + + # slice needed "diagonal" matrix + x = x[:, :, :-1, -length:, :] + + if x_dim == 4: + # remove fake depth dimension + x = tf.squeeze(x, axis=-1) + + return x + + def _matmul_with_relative_keys(self, x: tf.Tensor) -> tf.Tensor: + y = self.key_relative_embeddings + + if self.heads_share_relative_embedding: + matmul = tf.einsum("bhld,md->bhlm", x, y) + else: + matmul = tf.einsum("bhld,hmd->bhlm", x, y) + + return self._relative_to_absolute_position(matmul) + + def _tile_relative_embeddings(self, x: tf.Tensor, length: tf.Tensor) -> tf.Tensor: + if self.heads_share_relative_embedding: + x = tf.expand_dims(x, axis=0) # add head dimension + + x = tf.expand_dims(x, axis=1) # add length dimension + x = tf.tile(x, (1, length, 1, 1)) + return tf.expand_dims(x, axis=0) # add batch dimension + + def _squeeze_relative_embeddings(self, x: tf.Tensor) -> tf.Tensor: + x = tf.squeeze(x, axis=0) # squeeze batch dimension + if self.heads_share_relative_embedding: + x = tf.squeeze(x, axis=1) # squeeze head dimension + return x + + def _matmul_with_relative_values(self, x: tf.Tensor) -> tf.Tensor: + y = self._tile_relative_embeddings( + self.value_relative_embeddings, tf.shape(x)[-2] + ) + y = self._relative_to_absolute_position(y) + y = self._squeeze_relative_embeddings(y) + + if self.heads_share_relative_embedding: + return tf.einsum("bhlm,lmd->bhld", x, y) + else: + return tf.einsum("bhlm,hlmd->bhld", x, y) + + def _drop_attention_logits( + self, logits: tf.Tensor, pad_mask: tf.Tensor, training: tf.Tensor + ) -> tf.Tensor: + def droped_logits() -> tf.Tensor: + keep_prob = tf.random.uniform(tf.shape(logits), 0, 1) + pad_mask + drop_mask = tf.cast( + tf.less(keep_prob, self.attention_dropout_rate), logits.dtype + ) + + return logits + drop_mask * -1e9 + + return tf_utils.smart_cond(training, droped_logits, lambda: tf.identity(logits)) + + def _scaled_dot_product_attention( + self, + q: tf.Tensor, + k: tf.Tensor, + v: tf.Tensor, + pad_mask: tf.Tensor, + training: tf.Tensor, + ) -> Tuple[tf.Tensor, tf.Tensor]: + """Calculate the attention weights. + q, k, v must have matching leading dimensions. + k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v. + The mask has different shapes depending on its type(padding or look ahead) + but it must be broadcastable for addition. 
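+
+        The computation corresponds to
+        softmax(q * k^T / sqrt(d_k) + pad_mask * -1e9) * v,
+        with optional relative position terms added to the logits and to the
+        output when relative attention is enabled.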
+ + Args: + q: query shape == (..., seq_len_q, depth) + k: key shape == (..., seq_len_k, depth) + v: value shape == (..., seq_len_v, depth_v) + pad_mask: Float tensor with shape broadcastable + to (..., seq_len_q, seq_len_k). Defaults to None. + + Returns: + output, attention_weights + """ + + matmul_qk = tf.matmul(q, k, transpose_b=True) # (..., seq_len_q, seq_len_k) + + if self.use_key_relative_position: + matmul_qk += self._matmul_with_relative_keys(q) + + # scale matmul_qk + dk = tf.cast(tf.shape(k)[-1], tf.float32) + logits = matmul_qk / tf.math.sqrt(dk) + + # add the mask to the scaled tensor. + if pad_mask is not None: + logits += pad_mask * -1e9 + + # apply attention dropout before softmax to maintain attention_weights norm as 1 + if self.attention_dropout_rate > 0: + logits = self._drop_attention_logits(logits, pad_mask, training) + + # softmax is normalized on the last axis (seq_len_k) so that the scores + # add up to 1. + attention_weights = tf.nn.softmax( + logits, axis=-1 + ) # (..., seq_len_q, seq_len_k) + + output = tf.matmul(attention_weights, v) # (..., seq_len_q, depth_v) + if self.use_value_relative_position: + output += self._matmul_with_relative_values(attention_weights) + + return output, attention_weights + + def _split_heads(self, x: tf.Tensor) -> tf.Tensor: + """Split the last dimension into (num_heads, depth). + + Transpose the result such that the shape is + (batch_size, num_heads, seq_len, depth) + """ + + x = tf.reshape(x, (tf.shape(x)[0], -1, self.num_heads, self._depth)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def _combine_heads(self, x: tf.Tensor) -> tf.Tensor: + """Inverse of split_heads. + + Args: + x: a Tensor with shape [batch, num_heads, length, channels / num_heads] + + Returns: + a Tensor with shape [batch, length, channels] + """ + + # (batch_size, seq_len_q, num_heads, depth) + x = tf.transpose(x, perm=[0, 2, 1, 3]) + # (batch_size, seq_len_q, units) + return tf.reshape(x, (tf.shape(x)[0], -1, self.units)) + + # noinspection PyMethodOverriding + def call( + self, + v: tf.Tensor, + k: tf.Tensor, + q: tf.Tensor, + pad_mask: Optional[tf.Tensor] = None, + training: Optional[Union[tf.Tensor, bool]] = None, + ) -> Tuple[tf.Tensor, tf.Tensor]: + if training is None: + training = K.learning_phase() + + q = self._wq(q) # (batch_size, seq_len_q, units) + k = self._wk(k) # (batch_size, seq_len_k, units) + v = self._wv(v) # (batch_size, seq_len_v, units) + + q = self._split_heads(q) # (batch_size, num_heads, seq_len_q, depth) + k = self._split_heads(k) # (batch_size, num_heads, seq_len_k, depth) + v = self._split_heads(v) # (batch_size, num_heads, seq_len_v, depth) + + attention, attention_weights = self._scaled_dot_product_attention( + q, k, v, pad_mask, training + ) + # attention.shape == (batch_size, num_heads, seq_len_q, depth) + # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k) + attention = self._combine_heads(attention) # (batch_size, seq_len_q, units) + + output = self._dense(attention) # (batch_size, seq_len_q, units) + + return output, attention_weights + + +class TransformerEncoderLayer(tf.keras.layers.Layer): + def __init__( + self, + units: int, + num_heads: int, + filter_units: int, + dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + sparsity: float = 0.8, + unidirectional: bool = False, + use_key_relative_position: bool = False, + use_value_relative_position: bool = False, + max_relative_position: Optional[int] = None, + heads_share_relative_embedding: bool = False, + ) -> None: + 
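+        # This builds one "pre-norm" transformer encoder block:
+        # multi-head self-attention followed by a position-wise two-layer
+        # feed-forward network with GELU activation, each sub-block preceded
+        # by layer normalization and followed by dropout and a residual
+        # connection (see `call` below).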
super().__init__() + + self._layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6) + self._mha = MultiHeadAttention( + units, + num_heads, + attention_dropout_rate, + sparsity, + unidirectional, + use_key_relative_position, + use_value_relative_position, + max_relative_position, + heads_share_relative_embedding, + ) + self._dropout = tf.keras.layers.Dropout(dropout_rate) + + self._ffn_layers = [ + tf.keras.layers.LayerNormalization(epsilon=1e-6), + DenseWithSparseWeights( + units=filter_units, activation=tfa.activations.gelu, sparsity=sparsity + ), # (batch_size, seq_len, filter_units) + tf.keras.layers.Dropout(dropout_rate), + DenseWithSparseWeights( + units=units, sparsity=sparsity + ), # (batch_size, seq_len, units) + tf.keras.layers.Dropout(dropout_rate), + ] + + def call( + self, + x: tf.Tensor, + pad_mask: Optional[tf.Tensor] = None, + training: Optional[Union[tf.Tensor, bool]] = None, + ) -> tf.Tensor: + if training is None: + training = K.learning_phase() + + x_norm = self._layer_norm(x) # (batch_size, seq_len, units) + attn_out, _ = self._mha( + x_norm, x_norm, x_norm, pad_mask=pad_mask, training=training + ) + attn_out = self._dropout(attn_out, training=training) + x += attn_out + + ffn_out = x # (batch_size, seq_len, units) + for layer in self._ffn_layers: + ffn_out = layer(ffn_out, training=training) + x += ffn_out + + return x # (batch_size, seq_len, units) + + +class TransformerEncoder(tf.keras.layers.Layer): + def __init__( + self, + num_layers: int, + units: int, + num_heads: int, + filter_units: int, + reg_lambda: float, + dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + sparsity: float = 0.8, + unidirectional: bool = False, + use_key_relative_position: bool = False, + use_value_relative_position: bool = False, + max_relative_position: Optional[int] = None, + heads_share_relative_embedding: bool = False, + name: Optional[Text] = None, + ) -> None: + super().__init__(name=name) + + self.units = units + self.unidirectional = unidirectional + + l2_regularizer = tf.keras.regularizers.l2(reg_lambda) + self._embedding = DenseWithSparseWeights( + units=units, kernel_regularizer=l2_regularizer, sparsity=sparsity + ) + # positional encoding helpers + self._angles = self._get_angles() + self._even_indices = np.arange(0, self.units, 2, dtype=np.int32)[:, np.newaxis] + self._odd_indices = np.arange(1, self.units, 2, dtype=np.int32)[:, np.newaxis] + + self._dropout = tf.keras.layers.Dropout(dropout_rate) + + self._enc_layers = [ + TransformerEncoderLayer( + units, + num_heads, + filter_units, + dropout_rate, + attention_dropout_rate, + sparsity, + unidirectional, + use_key_relative_position, + use_value_relative_position, + max_relative_position, + heads_share_relative_embedding, + ) + for _ in range(num_layers) + ] + self._layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6) + + def _get_angles(self) -> np.ndarray: + i = np.arange(self.units)[np.newaxis, :] + return 1 / np.power(10000, (2 * (i // 2)) / np.float32(self.units)) + + def _positional_encoding(self, max_position: tf.Tensor) -> tf.Tensor: + max_position = tf.cast(max_position, dtype=tf.float32) + angle_rads = tf.range(max_position)[:, tf.newaxis] * self._angles + + # transpose for easy slicing + angle_rads = tf.transpose(angle_rads, perm=[1, 0]) + shape = tf.shape(angle_rads) + # apply sin to even indices in the array; 2i + sin_even = tf.sin(tf.gather_nd(angle_rads, self._even_indices)) + pos_encoding_even = tf.scatter_nd(self._even_indices, sin_even, shape) + # apply cos to odd indices in 
the array; 2i+1 + cos_odd = tf.cos(tf.gather_nd(angle_rads, self._odd_indices)) + pos_encoding_odd = tf.scatter_nd(self._odd_indices, cos_odd, shape) + # combine even and odd positions and transpose back + pos_encoding = tf.transpose(pos_encoding_even + pos_encoding_odd, perm=[1, 0]) + # add batch dimension + return tf.stop_gradient(pos_encoding[tf.newaxis, ...]) + + @staticmethod + def _look_ahead_pad_mask(max_position: tf.Tensor) -> tf.Tensor: + pad_mask = 1 - tf.linalg.band_part(tf.ones((max_position, max_position)), -1, 0) + return pad_mask[tf.newaxis, tf.newaxis, :, :] # (1, 1, seq_len, seq_len) + + def call( + self, + x: tf.Tensor, + pad_mask: Optional[tf.Tensor] = None, + training: Optional[Union[tf.Tensor, bool]] = None, + ) -> tf.Tensor: + + # adding embedding and position encoding. + x = self._embedding(x) # (batch_size, seq_len, units) + x *= tf.math.sqrt(tf.cast(self.units, tf.float32)) + x += self._positional_encoding(tf.shape(x)[1]) + x = self._dropout(x, training=training) + + if pad_mask is not None: + pad_mask = tf.squeeze(pad_mask, -1) # (batch_size, seq_len) + pad_mask = pad_mask[:, tf.newaxis, tf.newaxis, :] + # pad_mask.shape = (batch_size, 1, 1, seq_len) + if self.unidirectional: + # add look ahead pad mask to emulate unidirectional behavior + pad_mask = tf.minimum( + 1.0, pad_mask + self._look_ahead_pad_mask(tf.shape(pad_mask)[-1]) + ) # (batch_size, 1, seq_len, seq_len) + + for layer in self._enc_layers: + x = layer(x, pad_mask=pad_mask, training=training) + + # if normalization is done in encoding layers, then it should also be done + # on the output, since the output can grow very large, being the sum of + # a whole stack of unnormalized layer outputs. + return self._layer_norm(x) # (batch_size, seq_len, units) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index af2182980d28..62338631929f 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -1,1294 +1,236 @@ -from collections import defaultdict +import numpy as np import logging import scipy.sparse -import typing -from typing import List, Optional, Text, Dict, Tuple, Union, Generator, Callable, Any -import numpy as np -from tqdm import tqdm -from sklearn.model_selection import train_test_split -import tensorflow as tf -from tensor2tensor.models.transformer import ( - transformer_base, - transformer_prepare_encoder, - transformer_encoder, +from typing import Optional, Text, Dict, Any, Union, List +from rasa.core.constants import DIALOGUE +from rasa.nlu.constants import TEXT +from rasa.nlu.tokenizers.tokenizer import Token +import rasa.utils.io as io_utils +from rasa.utils.tensorflow.constants import ( + LABEL, + HIDDEN_LAYERS_SIZES, + NUM_TRANSFORMER_LAYERS, + NUM_HEADS, + DENSE_DIMENSION, + LOSS_TYPE, + SIMILARITY_TYPE, + NUM_NEG, + EVAL_NUM_EXAMPLES, + EVAL_NUM_EPOCHS, + REGULARIZATION_CONSTANT, + USE_MAX_NEG_SIM, + MAX_NEG_SIM, + MAX_POS_SIM, + EMBEDDING_DIMENSION, + DROP_RATE_DIALOGUE, + DROP_RATE_LABEL, + NEGATIVE_MARGIN_SCALE, + DROP_RATE, + EPOCHS, + SOFTMAX, + MARGIN, + AUTO, + INNER, + COSINE, ) -from tensor2tensor.layers.common_attention import large_compatible_negative -from rasa.utils.common import is_logging_disabled - -if typing.TYPE_CHECKING: - from tensor2tensor.utils.hparam import HParams - -# avoid warning println on contrib import - remove for tf 2 -tf.contrib._warning = None logger = logging.getLogger(__name__) -# type for all tf session related data -SessionDataType = Dict[Text, List[np.ndarray]] - - -def load_tf_config(config: Dict[Text, Any]) -> 
Optional[tf.compat.v1.ConfigProto]: - """Prepare `tf.compat.v1.ConfigProto` for training""" - - if config.get("tf_config") is not None: - return tf.compat.v1.ConfigProto(**config.pop("tf_config")) - else: - return None - - -def create_label_ids(label_ids: "np.ndarray") -> "np.ndarray": - """Convert various size label_ids into single dim array. - - for multi-label y, map each distinct row to a string repr - using join because str(row) uses an ellipsis if len(row) > 1000. - Idea taken from sklearn's stratify split. +def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray: + """Normalizes an array of positive numbers over the top `ranking_length` values. + Other values will be set to 0. """ - if label_ids.ndim == 1: - return label_ids - - if label_ids.ndim == 2 and label_ids.shape[-1] == 1: - return label_ids[:, 0] - - if label_ids.ndim == 2: - return np.array([" ".join(row.astype("str")) for row in label_ids]) - - if label_ids.ndim == 3 and label_ids.shape[-1] == 1: - return np.array([" ".join(row.astype("str")) for row in label_ids[:, :, 0]]) - - raise ValueError("Unsupported label_ids dimensions") - - -# noinspection PyPep8Naming -def train_val_split( - session_data: SessionDataType, - evaluate_on_num_examples: int, - random_seed: int, - label_key: Text, -) -> Tuple[SessionDataType, SessionDataType]: - """Create random hold out validation set using stratified split.""" - - if label_key not in session_data or len(session_data[label_key]) > 1: - raise ValueError(f"Key '{label_key}' not in SessionData.") - - label_ids = create_label_ids(session_data[label_key][0]) - - label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0))) - - check_train_test_sizes(evaluate_on_num_examples, label_counts, session_data) - - counts = np.array([label_counts[label] for label in label_ids]) - - multi_values = [v[counts > 1] for values in session_data.values() for v in values] - - solo_values = [v[counts == 1] for values in session_data.values() for v in values] - - output_values = train_test_split( - *multi_values, - test_size=evaluate_on_num_examples, - random_state=random_seed, - stratify=label_ids[counts > 1], - ) - - session_data_train, session_data_val = convert_train_test_split( - output_values, session_data, solo_values - ) - - return session_data_train, session_data_val - - -def check_train_test_sizes( - evaluate_on_num_examples: int, - label_counts: Dict[Any, int], - session_data: SessionDataType, -): - """Check whether the evaluation data set is too large or too small.""" - - num_examples = get_number_of_examples(session_data) - - if evaluate_on_num_examples >= num_examples - len(label_counts): - raise ValueError( - f"Validation set of {evaluate_on_num_examples} is too large. Remaining " - f"train set should be at least equal to number of classes " - f"{len(label_counts)}." - ) - elif evaluate_on_num_examples < len(label_counts): - raise ValueError( - f"Validation set of {evaluate_on_num_examples} is too small. It should be " - "at least equal to number of classes {label_counts}." - ) - - -def convert_train_test_split( - output_values: List[Any], session_data: SessionDataType, solo_values: List[Any] -) -> Tuple[SessionDataType, SessionDataType]: - """Convert the output of sklearn.model_selection.train_test_split into train and - eval session data.""" - - session_data_train = defaultdict(list) - session_data_val = defaultdict(list) - - # output_values = x_train, x_val, y_train, y_val, z_train, z_val, etc. - # order is kept, e.g. 
same order as session data keys - - # train datasets have an even index - index = 0 - for key, values in session_data.items(): - for _ in range(len(values)): - session_data_train[key].append( - combine_features(output_values[index * 2], solo_values[index]) - ) - index += 1 - - # val datasets have an odd index - index = 0 - for key, values in session_data.items(): - for _ in range(len(values)): - session_data_val[key].append(output_values[(index * 2) + 1]) - index += 1 - - return session_data_train, session_data_val - - -def combine_features( - feature_1: Union[np.ndarray, scipy.sparse.spmatrix], - feature_2: Union[np.ndarray, scipy.sparse.spmatrix], -) -> Union[np.ndarray, scipy.sparse.spmatrix]: - """Concatenate features.""" - - if isinstance(feature_1, scipy.sparse.spmatrix) and isinstance( - feature_2, scipy.sparse.spmatrix - ): - if feature_2.shape[0] == 0: - return feature_1 - if feature_1.shape[0] == 0: - return feature_2 - return scipy.sparse.vstack([feature_1, feature_2]) - - return np.concatenate([feature_1, feature_2]) - - -def shuffle_session_data(session_data: SessionDataType) -> SessionDataType: - """Shuffle session data.""" - - data_points = get_number_of_examples(session_data) - ids = np.random.permutation(data_points) - return session_data_for_ids(session_data, ids) - - -def session_data_for_ids(session_data: SessionDataType, ids: np.ndarray): - """Filter session data by ids.""" - - new_session_data = defaultdict(list) - for k, values in session_data.items(): - for v in values: - new_session_data[k].append(v[ids]) - return new_session_data - - -def split_session_data_by_label_ids( - session_data: SessionDataType, - label_ids: "np.ndarray", - unique_label_ids: "np.ndarray", -) -> List[SessionDataType]: - """Reorganize session data into a list of session data with the same labels.""" + new_values = values.copy() # prevent mutation of the input + if 0 < ranking_length < len(new_values): + ranked = sorted(new_values, reverse=True) + new_values[new_values < ranked[ranking_length - 1]] = 0 - label_data = [] - for label_id in unique_label_ids: - ids = label_ids == label_id - label_data.append(session_data_for_ids(session_data, ids)) - return label_data + if np.sum(new_values) > 0: + new_values = new_values / np.sum(new_values) + return new_values -# noinspection PyPep8Naming -def balance_session_data( - session_data: SessionDataType, batch_size: int, shuffle: bool, label_key: Text -) -> SessionDataType: - """Mix session data to account for class imbalance. - This batching strategy puts rare classes approximately in every other batch, - by repeating them. Mimics stratified batching, but also takes into account - that more populated classes should appear more often. +def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]: """ + If SIMILARITY_TYPE is set to 'auto', update the SIMILARITY_TYPE depending + on the LOSS_TYPE. 
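+
+    With a 'softmax' loss the similarity is set to 'inner' (dot-product),
+    with a 'margin' loss it is set to 'cosine'.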
+ Args: + config: model configuration - if label_key not in session_data or len(session_data[label_key]) > 1: - raise ValueError(f"Key '{label_key}' not in SessionDataType.") - - label_ids = create_label_ids(session_data[label_key][0]) - - unique_label_ids, counts_label_ids = np.unique( - label_ids, return_counts=True, axis=0 - ) - num_label_ids = len(unique_label_ids) - - # need to call every time, so that the data is shuffled inside each class - label_data = split_session_data_by_label_ids( - session_data, label_ids, unique_label_ids - ) - - data_idx = [0] * num_label_ids - num_data_cycles = [0] * num_label_ids - skipped = [False] * num_label_ids - - new_session_data = defaultdict(list) - num_examples = get_number_of_examples(session_data) - - while min(num_data_cycles) == 0: - if shuffle: - indices_of_labels = np.random.permutation(num_label_ids) - else: - indices_of_labels = range(num_label_ids) - - for index in indices_of_labels: - if num_data_cycles[index] > 0 and not skipped[index]: - skipped[index] = True - continue - else: - skipped[index] = False - - index_batch_size = ( - int(counts_label_ids[index] / num_examples * batch_size) + 1 - ) - - for k, values in label_data[index].items(): - for i, v in enumerate(values): - if len(new_session_data[k]) < i + 1: - new_session_data[k].append([]) - new_session_data[k][i].append( - v[data_idx[index] : data_idx[index] + index_batch_size] - ) - - data_idx[index] += index_batch_size - if data_idx[index] >= counts_label_ids[index]: - num_data_cycles[index] += 1 - data_idx[index] = 0 - - if min(num_data_cycles) > 0: - break - - final_session_data = defaultdict(list) - for k, values in new_session_data.items(): - for v in values: - final_session_data[k].append(np.concatenate(np.array(v))) - - return final_session_data - - -def get_number_of_examples(session_data: SessionDataType) -> int: - """Obtain number of examples in session data. - - Raise a ValueError if number of examples differ for different data in session data. + Returns: updated model configuration """ + if config.get(SIMILARITY_TYPE) == AUTO: + if config[LOSS_TYPE] == SOFTMAX: + config[SIMILARITY_TYPE] = INNER + elif config[LOSS_TYPE] == MARGIN: + config[SIMILARITY_TYPE] = COSINE - example_lengths = [v.shape[0] for values in session_data.values() for v in values] - - # check if number of examples is the same for all values - if not all(length == example_lengths[0] for length in example_lengths): - raise ValueError( - f"Number of examples differs for keys '{session_data.keys()}'. Number of " - f"examples should be the same for all data in session data." 
- ) - - return example_lengths[0] - - -def gen_batch( - session_data: SessionDataType, - batch_size: int, - label_key: Text, - batch_strategy: Text = "sequence", - shuffle: bool = False, -) -> Generator[Tuple, None, None]: - """Generate batches.""" - - if shuffle: - session_data = shuffle_session_data(session_data) - - if batch_strategy == "balanced": - session_data = balance_session_data( - session_data, batch_size, shuffle, label_key - ) - - num_examples = get_number_of_examples(session_data) - num_batches = num_examples // batch_size + int(num_examples % batch_size > 0) - - for batch_num in range(num_batches): - start = batch_num * batch_size - end = start + batch_size - - yield prepare_batch(session_data, start, end) - + return config -def prepare_batch( - session_data: SessionDataType, - start: Optional[int] = None, - end: Optional[int] = None, - tuple_sizes: Optional[Dict[Text, int]] = None, -) -> Tuple[Optional[np.ndarray]]: - """Slices session data into batch using given start and end value.""" - batch_data = [] +def align_tokens( + tokens_in: List[Text], token_end: int, token_start: int +) -> List[Token]: + """Align sub-tokens of Language model with tokens return by the WhitespaceTokenizer. - for key, values in session_data.items(): - # add None for not present values during processing - if not values: - if tuple_sizes: - batch_data += [None] * tuple_sizes[key] - else: - batch_data.append(None) - continue - - for v in values: - if start is not None and end is not None: - _data = v[start:end] - elif start is not None: - _data = v[start:] - elif end is not None: - _data = v[:end] - else: - _data = v[:] - - if isinstance(_data[0], scipy.sparse.spmatrix): - batch_data.extend(scipy_matrix_to_values(_data)) - else: - batch_data.append(pad_dense_data(_data)) - - # len of batch_data is equal to the number of keys in session data - return tuple(batch_data) - - -def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: - """Convert a scipy matrix into inidces, data, and shape.""" - - if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix): - array_of_sparse = [x.tocoo() for x in array_of_sparse] - - max_seq_len = max([x.shape[0] for x in array_of_sparse]) - - indices = np.hstack( - [ - np.vstack([i * np.ones_like(x.row), x.row, x.col]) - for i, x in enumerate(array_of_sparse) - ] - ).T - data = np.hstack([x.data for x in array_of_sparse]) - - shape = np.array((len(array_of_sparse), max_seq_len, array_of_sparse[0].shape[-1])) - - return [indices.astype(np.int64), data.astype(np.float32), shape.astype(np.int64)] - - -def pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray: - """Pad data of different lengths. - - Sequential data is padded with zeros. Zeros are added to the end of data. + As a language model might split a single word into multiple tokens, we need to make + sure that the start and end value of first and last sub-token matches the + start and end value of the token return by the WhitespaceTokenizer as the + entities are using those start and end values. 
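+
+    For example, aligning the sub-tokens ["sent", "ence"] of the token
+    "sentence" with token_start=0 and token_end=8 yields
+    Token("sent", 0, end=4) and Token("ence", 4, end=8).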
""" - if array_of_dense[0].ndim < 2: - # data doesn't contain a sequence - return array_of_dense - - data_size = len(array_of_dense) - max_seq_len = max([x.shape[0] for x in array_of_dense]) - - data_padded = np.zeros( - [data_size, max_seq_len, array_of_dense[0].shape[-1]], - dtype=array_of_dense[0].dtype, - ) - for i in range(data_size): - data_padded[i, : array_of_dense[i].shape[0], :] = array_of_dense[i] - - return data_padded.astype(np.float32) + tokens_out = [] + current_token_offset = token_start -def batch_to_session_data( - batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], session_data: SessionDataType -) -> Tuple[Dict[Text, List[tf.Tensor]], Dict[Text, int]]: - """Convert input batch tensors into batch data format. - - Batch contains any number of batch data. The order is equal to the - key-value pairs in session data. As sparse data were converted into indices, data, - shape before, this methods converts them into sparse tensors. Dense data is - kept. - """ - - batch_data = defaultdict(list) - # save the amount of placeholders attributed to session data keys - tuple_sizes = defaultdict(int) - - idx = 0 - for k, values in session_data.items(): - tuple_sizes[k] = 0 - for v in values: - if isinstance(v[0], scipy.sparse.spmatrix): - # explicitly substitute last dimension in shape with known static value - batch_data[k].append( - tf.SparseTensor( - batch[idx], - batch[idx + 1], - [batch[idx + 2][0], batch[idx + 2][1], v[0].shape[-1]], - ) - ) - tuple_sizes[k] += 3 - idx += 3 + for index, string in enumerate(tokens_in): + if index == 0: + if index == len(tokens_in) - 1: + s_token_end = token_end else: - batch_data[k].append(batch[idx]) - tuple_sizes[k] += 1 - idx += 1 - - return batch_data, tuple_sizes - - -def create_tf_dataset( - session_data: SessionDataType, - batch_size: Union["tf.Tensor", int], - label_key: Text, - batch_strategy: Text = "sequence", - shuffle: bool = False, -) -> "tf.data.Dataset": - """Create tf dataset.""" - - shapes, types = get_shapes_types(session_data) - - return tf.data.Dataset.from_generator( - lambda batch_size_: gen_batch( - session_data, batch_size_, label_key, batch_strategy, shuffle - ), - output_types=types, - output_shapes=shapes, - args=([batch_size]), - ) - - -def get_shapes_types(session_data: SessionDataType) -> Tuple: - """Extract shapes and types from session data.""" - - types = [] - shapes = [] - - def append_shape(v: np.ndarray): - if isinstance(v[0], scipy.sparse.spmatrix): - # scipy matrix is converted into indices, data, shape - shapes.append((None, v[0].ndim + 1)) - shapes.append((None,)) - shapes.append((v[0].ndim + 1)) - elif v[0].ndim == 0: - shapes.append((None,)) - elif v[0].ndim == 1: - shapes.append((None, v[0].shape[-1])) + s_token_end = current_token_offset + len(string) + tokens_out.append(Token(string, token_start, end=s_token_end)) + elif index == len(tokens_in) - 1: + tokens_out.append(Token(string, current_token_offset, end=token_end)) else: - shapes.append((None, None, v[0].shape[-1])) - - def append_type(v: np.ndarray): - if isinstance(v[0], scipy.sparse.spmatrix): - # scipy matrix is converted into indices, data, shape - types.append(tf.int64) - types.append(tf.float32) - types.append(tf.int64) - else: - types.append(tf.float32) - - for values in session_data.values(): - for v in values: - append_shape(v) - append_type(v) - - return tuple(shapes), tuple(types) - - -def create_iterator_init_datasets( - session_data: SessionDataType, - eval_session_data: SessionDataType, - batch_size: Union["tf.Tensor", int], - 
batch_strategy: Text, - label_key: Text, -) -> Tuple["tf.data.Iterator", "tf.Operation", "tf.Operation"]: - """Create iterator and init datasets.""" - - train_dataset = create_tf_dataset( - session_data, - batch_size, - label_key=label_key, - batch_strategy=batch_strategy, - shuffle=True, - ) - - iterator = tf.data.Iterator.from_structure( - train_dataset.output_types, train_dataset.output_shapes - ) - - train_init_op = iterator.make_initializer(train_dataset) - - if eval_session_data is not None: - eval_init_op = iterator.make_initializer( - create_tf_dataset(eval_session_data, batch_size, label_key=label_key) - ) - else: - eval_init_op = None - - return iterator, train_init_op, eval_init_op - - -# noinspection PyPep8Naming -def tf_dense_layer_for_sparse( - inputs: tf.SparseTensor, - units: int, - name: Text, - C2: float, - activation: Optional[Callable] = tf.nn.relu, - use_bias: bool = True, -) -> tf.Tensor: - """Dense layer for sparse input tensor""" - - if not isinstance(inputs, tf.SparseTensor): - raise ValueError("Input tensor should be sparse.") - - with tf.variable_scope("dense_layer_for_sparse_" + name, reuse=tf.AUTO_REUSE): - kernel_regularizer = tf.contrib.layers.l2_regularizer(C2) - kernel = tf.get_variable( - "kernel", - shape=[inputs.shape[-1], units], - dtype=inputs.dtype, - regularizer=kernel_regularizer, - ) - bias = tf.get_variable("bias", shape=[units], dtype=inputs.dtype) - - # outputs will be 2D - outputs = tf.sparse.matmul( - tf.sparse.reshape(inputs, [-1, int(inputs.shape[-1])]), kernel - ) - - if len(inputs.shape) == 3: - # reshape back - outputs = tf.reshape( - outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1) + tokens_out.append( + Token( + string, current_token_offset, end=current_token_offset + len(string) + ) ) - if use_bias: - outputs = tf.nn.bias_add(outputs, bias) - - if activation is None: - return outputs + current_token_offset += len(string) - return activation(outputs) + return tokens_out -# noinspection PyPep8Naming -def create_tf_fnn( - x_in: "tf.Tensor", - layer_sizes: List[int], - droprate: float, - C2: float, - is_training: "tf.Tensor", - layer_name_suffix: Text, - activation: Optional[Callable] = tf.nn.relu, - use_bias: bool = True, - kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None, -) -> "tf.Tensor": - """Create nn with hidden layers and name suffix.""" - - reg = tf.contrib.layers.l2_regularizer(C2) - x = tf.nn.relu(x_in) - for i, layer_size in enumerate(layer_sizes): - x = tf.layers.dense( - inputs=x, - units=layer_size, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - kernel_regularizer=reg, - name=f"hidden_layer_{layer_name_suffix}_{i}", - reuse=tf.AUTO_REUSE, - ) - x = tf.layers.dropout(x, rate=droprate, training=is_training) - return x - - -def tf_normalize_if_cosine(x: "tf.Tensor", similarity_type: Text) -> "tf.Tensor": - """Normalize embedding if similarity type is cosine.""" - - if similarity_type == "cosine": - return tf.nn.l2_normalize(x, -1) - elif similarity_type == "inner": - return x - else: - raise ValueError( - f"Wrong similarity type '{similarity_type}', " - f"should be 'cosine' or 'inner'" - ) - - -# noinspection PyPep8Naming -def create_tf_embed( - x: "tf.Tensor", - embed_dim: int, - C2: float, - similarity_type: Text, - layer_name_suffix: Text, -) -> "tf.Tensor": - """Create dense embedding layer with a name.""" - - reg = tf.contrib.layers.l2_regularizer(C2) - embed_x = tf.layers.dense( - inputs=x, - units=embed_dim, - activation=None, - 
kernel_regularizer=reg, - name=f"embed_layer_{layer_name_suffix}", - reuse=tf.AUTO_REUSE, - ) - # normalize embedding vectors for cosine similarity - return tf_normalize_if_cosine(embed_x, similarity_type) - - -def create_t2t_hparams( - num_transformer_layers: int, - transformer_size: int, - num_heads: int, - droprate: float, - pos_encoding: Text, - max_seq_length: int, - is_training: "tf.Tensor", -) -> "HParams": - """Create parameters for t2t transformer.""" - - hparams = transformer_base() - - hparams.num_hidden_layers = num_transformer_layers - hparams.hidden_size = transformer_size - # it seems to be factor of 4 for transformer architectures in t2t - hparams.filter_size = hparams.hidden_size * 4 - hparams.num_heads = num_heads - hparams.relu_dropout = droprate - hparams.pos = pos_encoding - - hparams.max_length = max_seq_length - - hparams.unidirectional_encoder = True - - hparams.self_attention_type = "dot_product_relative_v2" - hparams.max_relative_position = 5 - hparams.add_relative_to_values = True - - # When not in training mode, set all forms of dropout to zero. - for key, value in hparams.values().items(): - if key.endswith("dropout") or key == "label_smoothing": - setattr(hparams, key, value * tf.cast(is_training, tf.float32)) - - return hparams +def sequence_to_sentence_features( + features: Union[np.ndarray, scipy.sparse.spmatrix] +) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]: + """Extract the CLS token vector as sentence features. + Features is a sequence. The last token is the CLS token. The feature vector of + this token contains the sentence features.""" + if features is None: + return None -# noinspection PyUnresolvedReferences -# noinspection PyPep8Naming -def create_t2t_transformer_encoder( - x_in: "tf.Tensor", - mask: "tf.Tensor", - attention_weights: Dict[Text, "tf.Tensor"], - hparams: "HParams", - C2: float, - is_training: "tf.Tensor", -) -> "tf.Tensor": - """Create t2t transformer encoder.""" + if isinstance(features, scipy.sparse.spmatrix): + return scipy.sparse.coo_matrix(features.tocsr()[-1]) - with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE): - x = create_tf_fnn( - x_in, - [hparams.hidden_size], - hparams.layer_prepostprocess_dropout, - C2, - is_training, - layer_name_suffix="pre_embed", - activation=None, - use_bias=False, - kernel_initializer=tf.random_normal_initializer( - 0.0, hparams.hidden_size ** -0.5 - ), - ) - if hparams.multiply_embedding_mode == "sqrt_depth": - x *= hparams.hidden_size ** 0.5 + return np.expand_dims(features[-1], axis=0) - x *= tf.expand_dims(mask, -1) - ( - x, - self_attention_bias, - encoder_decoder_attention_bias, - ) = transformer_prepare_encoder(x, None, hparams) - x *= tf.expand_dims(mask, -1) +def update_evaluation_parameters(config: Dict[Text, Any]) -> Dict[Text, Any]: + """ + If EVAL_NUM_EPOCHS is set to -1, evaluate at the end of the training. - x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) + Args: + config: model configuration - attn_bias_for_padding = None - # Otherwise the encoder will just use encoder_self_attention_bias. 
- if hparams.unidirectional_encoder: - attn_bias_for_padding = encoder_decoder_attention_bias + Returns: updated model configuration + """ - x = transformer_encoder( - x, - self_attention_bias, - hparams, - nonpadding=mask, - save_weights_to=attention_weights, - attn_bias_for_padding=attn_bias_for_padding, + if config[EVAL_NUM_EPOCHS] == -1: + config[EVAL_NUM_EPOCHS] = config[EPOCHS] + elif config[EVAL_NUM_EPOCHS] < 1: + raise ValueError( + f"'{EVAL_NUM_EXAMPLES}' is set to " + f"'{config[EVAL_NUM_EPOCHS]}'. " + f"Only values > 1 are allowed for this configuration value." ) - x *= tf.expand_dims(mask, -1) + return config - return tf.nn.dropout(tf.nn.relu(x), 1.0 - hparams.layer_prepostprocess_dropout) +def load_tf_hub_model(model_url: Text) -> Any: + """Load model from cache if possible, otherwise from TFHub""" -def _tf_make_flat(x: "tf.Tensor") -> "tf.Tensor": - """Make tensor 2D.""" + import tensorflow_hub as tfhub - return tf.reshape(x, (-1, x.shape[-1])) + # needed to load the ConveRT model + # noinspection PyUnresolvedReferences + import tensorflow_text + import os + # required to take care of cases when other files are already + # stored in the default TFHUB_CACHE_DIR + try: + return tfhub.load(model_url) + except OSError: + directory = io_utils.create_temporary_directory() + os.environ["TFHUB_CACHE_DIR"] = directory + return tfhub.load(model_url) -def _tf_sample_neg( - batch_size: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor" -) -> "tf.Tensor": - """Sample negative examples for given indices""" - tiled_all_bs = tf.tile(tf.expand_dims(all_bs, 0), (batch_size, 1, 1)) - - return tf.batch_gather(tiled_all_bs, neg_ids) +def _replace_deprecated_option( + old_option: Text, new_option: Union[Text, List[Text]], config: Dict[Text, Any] +) -> Dict[Text, Any]: + if old_option in config: + if isinstance(new_option, str): + logger.warning( + f"Option '{old_option}' got renamed to '{new_option}'. " + f"Please update your configuration file." + ) + config[new_option] = config[old_option] + else: + logger.warning( + f"Option '{old_option}' got renamed to " + f"a dictionary '{new_option[0]}' with a key '{new_option[1]}'. " + f"Please update your configuration file." + ) + option_dict = config.get(new_option[0], {}) + option_dict[new_option[1]] = config[old_option] + config[new_option[0]] = option_dict + return config -def _tf_get_bad_mask( - pos_b: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor" -) -> "tf.Tensor": - """Calculate bad mask for given indices. - Checks that input features are different for positive negative samples. +def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]: """ + If old model configuration parameters are present in the provided config, replace + them with the new parameters and log a warning. 
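+
+    For example, the old 'droprate' option is moved to DROP_RATE, and
+    'hidden_layers_sizes_a' becomes the TEXT entry of HIDDEN_LAYERS_SIZES.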
+ Args: + config: model configuration - pos_b_in_flat = tf.expand_dims(pos_b, -2) - neg_b_in_flat = _tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) - - return tf.cast( - tf.reduce_all(tf.equal(neg_b_in_flat, pos_b_in_flat), axis=-1), - pos_b_in_flat.dtype, - ) - - -def _tf_get_negs( - all_embed: "tf.Tensor", all_raw: "tf.Tensor", raw_pos: "tf.Tensor", num_neg: int -) -> Tuple["tf.Tensor", "tf.Tensor"]: - """Get negative examples from given tensor.""" - - if len(raw_pos.shape) == 3: - batch_size = tf.shape(raw_pos)[0] - seq_length = tf.shape(raw_pos)[1] - else: # len(raw_pos.shape) == 2 - batch_size = tf.shape(raw_pos)[0] - seq_length = 1 - - raw_flat = _tf_make_flat(raw_pos) - - total_candidates = tf.shape(all_embed)[0] - - all_indices = tf.tile( - tf.expand_dims(tf.range(0, total_candidates, 1), 0), - (batch_size * seq_length, 1), - ) - shuffled_indices = tf.transpose( - tf.random.shuffle(tf.transpose(all_indices, (1, 0))), (1, 0) - ) - neg_ids = shuffled_indices[:, :num_neg] - - bad_negs = _tf_get_bad_mask(raw_flat, all_raw, neg_ids) - if len(raw_pos.shape) == 3: - bad_negs = tf.reshape(bad_negs, (batch_size, seq_length, -1)) - - neg_embed = _tf_sample_neg(batch_size * seq_length, all_embed, neg_ids) - if len(raw_pos.shape) == 3: - neg_embed = tf.reshape( - neg_embed, (batch_size, seq_length, -1, all_embed.shape[-1]) - ) - - return neg_embed, bad_negs - - -def sample_negatives( - a_embed: "tf.Tensor", - b_embed: "tf.Tensor", - b_raw: "tf.Tensor", - all_b_embed: "tf.Tensor", - all_b_raw: "tf.Tensor", - num_neg: int, -) -> Tuple[ - "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor" -]: - """Sample negative examples.""" + Returns: updated model configuration + """ - neg_dial_embed, dial_bad_negs = _tf_get_negs( - _tf_make_flat(a_embed), _tf_make_flat(b_raw), b_raw, num_neg + config = _replace_deprecated_option( + "hidden_layers_sizes_pre_dial", [HIDDEN_LAYERS_SIZES, DIALOGUE], config ) - - neg_bot_embed, bot_bad_negs = _tf_get_negs(all_b_embed, all_b_raw, b_raw, num_neg) - return ( - tf.expand_dims(a_embed, -2), - tf.expand_dims(b_embed, -2), - neg_dial_embed, - neg_bot_embed, - dial_bad_negs, - bot_bad_negs, + config = _replace_deprecated_option( + "hidden_layers_sizes_bot", [HIDDEN_LAYERS_SIZES, LABEL], config ) - - -def tf_raw_sim( - a: "tf.Tensor", b: "tf.Tensor", mask: Optional["tf.Tensor"] -) -> "tf.Tensor": - """Calculate similarity between given tensors.""" - - sim = tf.reduce_sum(a * b, -1) - if mask is not None: - sim *= tf.expand_dims(mask, 2) - - return sim - - -def tf_sim( - pos_dial_embed: "tf.Tensor", - pos_bot_embed: "tf.Tensor", - neg_dial_embed: "tf.Tensor", - neg_bot_embed: "tf.Tensor", - dial_bad_negs: "tf.Tensor", - bot_bad_negs: "tf.Tensor", - mask: Optional["tf.Tensor"], -) -> Tuple["tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor", "tf.Tensor"]: - """Define similarity.""" - - # calculate similarity with several - # embedded actions for the loss - neg_inf = large_compatible_negative(pos_dial_embed.dtype) - - sim_pos = tf_raw_sim(pos_dial_embed, pos_bot_embed, mask) - sim_neg = tf_raw_sim(pos_dial_embed, neg_bot_embed, mask) + neg_inf * bot_bad_negs - sim_neg_bot_bot = ( - tf_raw_sim(pos_bot_embed, neg_bot_embed, mask) + neg_inf * bot_bad_negs + config = _replace_deprecated_option("droprate", DROP_RATE, config) + config = _replace_deprecated_option("droprate_a", DROP_RATE_DIALOGUE, config) + config = _replace_deprecated_option("droprate_b", DROP_RATE_LABEL, config) + config = _replace_deprecated_option( + "hidden_layers_sizes_a", 
[HIDDEN_LAYERS_SIZES, TEXT], config ) - sim_neg_dial_dial = ( - tf_raw_sim(pos_dial_embed, neg_dial_embed, mask) + neg_inf * dial_bad_negs + config = _replace_deprecated_option( + "hidden_layers_sizes_b", [HIDDEN_LAYERS_SIZES, LABEL], config ) - sim_neg_bot_dial = ( - tf_raw_sim(pos_bot_embed, neg_dial_embed, mask) + neg_inf * dial_bad_negs + config = _replace_deprecated_option( + "num_transformer_layers", NUM_TRANSFORMER_LAYERS, config ) - - # output similarities between user input and bot actions - # and similarities between bot actions and similarities between user inputs - return sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial - - -def tf_calc_accuracy(sim_pos: "tf.Tensor", sim_neg: "tf.Tensor") -> "tf.Tensor": - """Calculate accuracy""" - - max_all_sim = tf.reduce_max(tf.concat([sim_pos, sim_neg], -1), -1) - return tf.reduce_mean( - tf.cast(tf.math.equal(max_all_sim, tf.squeeze(sim_pos, -1)), tf.float32) + config = _replace_deprecated_option("num_heads", NUM_HEADS, config) + config = _replace_deprecated_option("dense_dim", DENSE_DIMENSION, config) + config = _replace_deprecated_option("embed_dim", EMBEDDING_DIMENSION, config) + config = _replace_deprecated_option("num_neg", NUM_NEG, config) + config = _replace_deprecated_option("mu_pos", MAX_POS_SIM, config) + config = _replace_deprecated_option("mu_neg", MAX_NEG_SIM, config) + config = _replace_deprecated_option("use_max_sim_neg", USE_MAX_NEG_SIM, config) + config = _replace_deprecated_option("C2", REGULARIZATION_CONSTANT, config) + config = _replace_deprecated_option("C_emb", NEGATIVE_MARGIN_SCALE, config) + config = _replace_deprecated_option( + "evaluate_every_num_epochs", EVAL_NUM_EPOCHS, config ) - - -# noinspection PyPep8Naming -def tf_loss_margin( - sim_pos: "tf.Tensor", - sim_neg: "tf.Tensor", - sim_neg_bot_bot: "tf.Tensor", - sim_neg_dial_dial: "tf.Tensor", - sim_neg_bot_dial: "tf.Tensor", - mask: Optional["tf.Tensor"], - mu_pos: float, - mu_neg: float, - use_max_sim_neg: bool, - C_emb: float, -) -> "tf.Tensor": - """Define max margin loss.""" - - # loss for maximizing similarity with correct action - loss = tf.maximum(0.0, mu_pos - tf.squeeze(sim_pos, -1)) - - # loss for minimizing similarity with `num_neg` incorrect actions - if use_max_sim_neg: - # minimize only maximum similarity over incorrect actions - max_sim_neg = tf.reduce_max(sim_neg, -1) - loss += tf.maximum(0.0, mu_neg + max_sim_neg) - else: - # minimize all similarities with incorrect actions - max_margin = tf.maximum(0.0, mu_neg + sim_neg) - loss += tf.reduce_sum(max_margin, -1) - - # penalize max similarity between pos bot and neg bot embeddings - max_sim_neg_bot = tf.maximum(0.0, tf.reduce_max(sim_neg_bot_bot, -1)) - loss += max_sim_neg_bot * C_emb - - # penalize max similarity between pos dial and neg dial embeddings - max_sim_neg_dial = tf.maximum(0.0, tf.reduce_max(sim_neg_dial_dial, -1)) - loss += max_sim_neg_dial * C_emb - - # penalize max similarity between pos bot and neg dial embeddings - max_sim_neg_dial = tf.maximum(0.0, tf.reduce_max(sim_neg_bot_dial, -1)) - loss += max_sim_neg_dial * C_emb - - if mask is not None: - # mask loss for different length sequences - loss *= mask - # average the loss over sequence length - loss = tf.reduce_sum(loss, -1) / tf.reduce_sum(mask, 1) - - # average the loss over the batch - loss = tf.reduce_mean(loss) - - # add regularization losses - loss += tf.losses.get_regularization_loss() - - return loss - - -def tf_loss_softmax( - sim_pos: "tf.Tensor", - sim_neg: "tf.Tensor", - sim_neg_bot_bot: 
"tf.Tensor", - sim_neg_dial_dial: "tf.Tensor", - sim_neg_bot_dial: "tf.Tensor", - mask: Optional["tf.Tensor"], - scale_loss: bool, -) -> "tf.Tensor": - """Define softmax loss.""" - - logits = tf.concat( - [sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial], -1 + config = _replace_deprecated_option( + "evaluate_on_num_examples", EVAL_NUM_EXAMPLES, config ) - # create labels for softmax - if len(logits.shape) == 3: - pos_labels = tf.ones_like(logits[:, :, :1]) - neg_labels = tf.zeros_like(logits[:, :, 1:]) - else: # len(logits.shape) == 2 - pos_labels = tf.ones_like(logits[:, :1]) - neg_labels = tf.zeros_like(logits[:, 1:]) - labels = tf.concat([pos_labels, neg_labels], -1) - - if mask is None: - mask = 1.0 - - if scale_loss: - # mask loss by prediction confidence - pred = tf.nn.softmax(logits) - if len(pred.shape) == 3: - pos_pred = pred[:, :, 0] - else: # len(pred.shape) == 2 - pos_pred = pred[:, 0] - mask *= tf.pow((1 - pos_pred) / 0.5, 4) - - loss = tf.losses.softmax_cross_entropy(labels, logits, mask) - # add regularization losses - loss += tf.losses.get_regularization_loss() - - return loss - - -# noinspection PyPep8Naming -def choose_loss( - sim_pos: "tf.Tensor", - sim_neg: "tf.Tensor", - sim_neg_bot_bot: "tf.Tensor", - sim_neg_dial_dial: "tf.Tensor", - sim_neg_bot_dial: "tf.Tensor", - mask: Optional["tf.Tensor"], - loss_type: Text, - mu_pos: float, - mu_neg: float, - use_max_sim_neg: bool, - C_emb: float, - scale_loss: bool, -) -> "tf.Tensor": - """Use loss depending on given option.""" - - if loss_type == "margin": - return tf_loss_margin( - sim_pos, - sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask, - mu_pos, - mu_neg, - use_max_sim_neg, - C_emb, - ) - elif loss_type == "softmax": - return tf_loss_softmax( - sim_pos, - sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask, - scale_loss, - ) - else: - raise ValueError( - f"Wrong loss type '{loss_type}', should be 'margin' or 'softmax'." - ) - - -# noinspection PyPep8Naming -def calculate_loss_acc( - a_embed: "tf.Tensor", - b_embed: "tf.Tensor", - b_raw: "tf.Tensor", - all_b_embed: "tf.Tensor", - all_b_raw: "tf.Tensor", - num_neg: int, - mask: Optional["tf.Tensor"], - loss_type: Text, - mu_pos: float, - mu_neg: float, - use_max_sim_neg: bool, - C_emb: float, - scale_loss: bool, -) -> Tuple["tf.Tensor", "tf.Tensor"]: - """Calculate loss and accuracy.""" - - ( - pos_dial_embed, - pos_bot_embed, - neg_dial_embed, - neg_bot_embed, - dial_bad_negs, - bot_bad_negs, - ) = sample_negatives(a_embed, b_embed, b_raw, all_b_embed, all_b_raw, num_neg) - - # calculate similarities - (sim_pos, sim_neg, sim_neg_bot_bot, sim_neg_dial_dial, sim_neg_bot_dial) = tf_sim( - pos_dial_embed, - pos_bot_embed, - neg_dial_embed, - neg_bot_embed, - dial_bad_negs, - bot_bad_negs, - mask, - ) - - acc = tf_calc_accuracy(sim_pos, sim_neg) - - loss = choose_loss( - sim_pos, - sim_neg, - sim_neg_bot_bot, - sim_neg_dial_dial, - sim_neg_bot_dial, - mask, - loss_type, - mu_pos, - mu_neg, - use_max_sim_neg, - C_emb, - scale_loss, - ) - - return loss, acc - - -def confidence_from_sim(sim: "tf.Tensor", similarity_type: Text) -> "tf.Tensor": - if similarity_type == "cosine": - # clip negative values to zero - return tf.nn.relu(sim) - else: - # normalize result to [0, 1] with softmax - return tf.nn.softmax(sim) - - -def linearly_increasing_batch_size( - epoch: int, batch_size: Union[List[int], int], epochs: int -) -> int: - """Linearly increase batch size with every epoch. 
- - The idea comes from https://arxiv.org/abs/1711.00489. - """ - - if not isinstance(batch_size, list): - return int(batch_size) - - if epochs > 1: - return int( - batch_size[0] + epoch * (batch_size[1] - batch_size[0]) / (epochs - 1) - ) - else: - return int(batch_size[0]) - - -def output_validation_stat( - eval_init_op: "tf.Operation", - loss: "tf.Tensor", - acc: "tf.Tensor", - session: "tf.Session", - is_training: "tf.Session", - batch_size_in: "tf.Tensor", - ep_batch_size: int, -) -> Tuple[float, float]: - """Output training statistics""" - - session.run(eval_init_op, feed_dict={batch_size_in: ep_batch_size}) - ep_val_loss = 0 - ep_val_acc = 0 - batches_per_epoch = 0 - while True: - try: - batch_val_loss, batch_val_acc = session.run( - [loss, acc], feed_dict={is_training: False} - ) - batches_per_epoch += 1 - ep_val_loss += batch_val_loss - ep_val_acc += batch_val_acc - except tf.errors.OutOfRangeError: - break - - return ep_val_loss / batches_per_epoch, ep_val_acc / batches_per_epoch - - -def train_tf_dataset( - train_init_op: "tf.Operation", - eval_init_op: "tf.Operation", - batch_size_in: "tf.Tensor", - loss: "tf.Tensor", - acc: "tf.Tensor", - train_op: "tf.Tensor", - session: "tf.Session", - is_training: "tf.Session", - epochs: int, - batch_size: Union[List[int], int], - evaluate_on_num_examples: int, - evaluate_every_num_epochs: int, -) -> None: - """Train tf graph""" - - session.run(tf.global_variables_initializer()) - - if evaluate_on_num_examples: - logger.info( - f"Validation accuracy is calculated every {evaluate_every_num_epochs} " - f"epochs." - ) - pbar = tqdm(range(epochs), desc="Epochs", disable=is_logging_disabled()) - - train_loss = 0 - train_acc = 0 - val_loss = 0 - val_acc = 0 - for ep in pbar: - - ep_batch_size = linearly_increasing_batch_size(ep, batch_size, epochs) - - session.run(train_init_op, feed_dict={batch_size_in: ep_batch_size}) - - ep_train_loss = 0 - ep_train_acc = 0 - batches_per_epoch = 0 - while True: - try: - _, batch_train_loss, batch_train_acc = session.run( - [train_op, loss, acc], feed_dict={is_training: True} - ) - batches_per_epoch += 1 - ep_train_loss += batch_train_loss - ep_train_acc += batch_train_acc - - except tf.errors.OutOfRangeError: - break - - train_loss = ep_train_loss / batches_per_epoch - train_acc = ep_train_acc / batches_per_epoch - - postfix_dict = {"loss": f"{train_loss:.3f}", "acc": f"{train_acc:.3f}"} - - if eval_init_op is not None: - if (ep + 1) % evaluate_every_num_epochs == 0 or (ep + 1) == epochs: - val_loss, val_acc = output_validation_stat( - eval_init_op, - loss, - acc, - session, - is_training, - batch_size_in, - ep_batch_size, - ) - - postfix_dict.update( - {"val_loss": f"{val_loss:.3f}", "val_acc": f"{val_acc:.3f}"} - ) - - pbar.set_postfix(postfix_dict) - - final_message = ( - f"Finished training embedding policy, " - f"train loss={train_loss:.3f}, train accuracy={train_acc:.3f}" - ) - if eval_init_op is not None: - final_message += ( - f", validation loss={val_loss:.3f}, validation accuracy={val_acc:.3f}" - ) - logger.info(final_message) - - -def extract_attention(attention_weights) -> Optional["tf.Tensor"]: - """Extract attention probabilities from t2t dict""" - - attention = [ - tf.expand_dims(t, 0) - for name, t in attention_weights.items() - # the strings come from t2t library - if "multihead_attention/dot_product" in name and not name.endswith("/logits") - ] - - if attention: - return tf.concat(attention, 0) - - -def persist_tensor( - name: Text, - tensor: Union["tf.Tensor", Tuple["tf.Tensor"], 
List["tf.Tensor"]], - graph: "tf.Graph", -) -> None: - """Add tensor to collection if it is not None""" - - if tensor is not None: - graph.clear_collection(name) - if isinstance(tensor, tuple) or isinstance(tensor, list): - for t in tensor: - graph.add_to_collection(name, t) - else: - graph.add_to_collection(name, tensor) - - -def load_tensor(name: Text) -> Optional[Union["tf.Tensor", List["tf.Tensor"]]]: - """Load tensor or set it to None""" - - tensor_list = tf.get_collection(name) - - if not tensor_list: - return None - - if len(tensor_list) == 1: - return tensor_list[0] - - return tensor_list - - -def normalize(values: "np.ndarray", ranking_length: Optional[int] = 0) -> "np.ndarray": - """Normalizes an array of positive numbers over the top `ranking_length` values. - - Other values will be set to 0. - """ - - new_values = values.copy() # prevent mutation of the input - if 0 < ranking_length < len(new_values): - ranked = sorted(new_values, reverse=True) - new_values[new_values < ranked[ranking_length - 1]] = 0 - - if np.sum(new_values) > 0: - new_values = new_values / np.sum(new_values) - - return new_values + return config diff --git a/rasa/version.py b/rasa/version.py index 8015df84ebfc..e317dc37ca9b 100644 --- a/rasa/version.py +++ b/rasa/version.py @@ -1,3 +1,3 @@ # this file will automatically be changed, # do not add anything but the version number here! -__version__ = "1.7.4" +__version__ = "1.8.0a1" diff --git a/requirements.txt b/requirements.txt index d1c16eb73437..6233aee22f95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,17 +7,11 @@ jsonpickle==1.1 redis==3.3.5 pymongo[tls,srv]==3.8.0 numpy==1.16.3 -scipy==1.2.1 -tensorflow-cpu==1.15.0 +scipy==1.4.1 absl-py>=0.8.0 # setuptools comes from tensorboard requirement: # https://github.com/tensorflow/tensorboard/blob/1.14/tensorboard/pip_package/setup.py#L33 setuptools >= 41.0.0 -tensorflow-probability==0.7.0 -tensor2tensor==1.14.0 -# fixes https://github.com/RasaHQ/rasa/issues/5231 -# remove when removing `tensor2tensor` -gym==0.15.4 apscheduler==3.6.0 tqdm==4.31.0 networkx==2.4.0 @@ -63,7 +57,7 @@ kafka-python==1.4.7 sklearn-crfsuite==0.3.6 psycopg2-binary==2.8.2 PyJWT==1.7.1 -# remove when tensorflow@1.15.x or a pre-release patch is released -# https://github.com/tensorflow/tensorflow/issues/32319 -gast==0.2.2 python-dateutil==2.8.0 +# for new featurizers +tensorflow==2.1.0 +tensorflow-addons==0.8.2 diff --git a/setup.py b/setup.py index affba49353ba..bb703444796d 100644 --- a/setup.py +++ b/setup.py @@ -37,18 +37,11 @@ "pymongo[tls,srv]~=3.8", "numpy~=1.16", "scipy~=1.2", - "tensorflow-cpu~=1.15.0", + "tensorflow~=2.1", + "tensorflow-addons~=0.8", # absl is a tensorflow dependency, but produces double logging before 0.8 # should be removed once tensorflow requires absl > 0.8 on its own "absl-py>=0.8.0", - # setuptools comes from tensorboard requirement: - # https://github.com/tensorflow/tensorboard/blob/1.14/tensorboard/pip_package/setup.py#L33 - "setuptools >= 41.0.0", - "tensorflow-probability~=0.7.0", - "tensor2tensor~=1.14.0", - # fixes https://github.com/RasaHQ/rasa/issues/5231 - # remove when removing `tensor2tensor` - "gym<=0.15.4", "apscheduler~=3.0", "tqdm~=4.0", "networkx~=2.4.0", @@ -92,15 +85,13 @@ "SQLAlchemy~=1.3.0", "sklearn-crfsuite~=0.3.6", "PyJWT~=1.7", - # remove when tensorflow@1.15.x or a pre-release patch is released - # https://github.com/tensorflow/tensorflow/issues/32319 - "gast==0.2.2", ] extras_requires = { "test": tests_requires, "spacy": ["spacy>=2.1,<2.2"], - "convert": 
["tensorflow_text~=1.15.1", "tensorflow_hub~=0.6.0"], + "convert": ["tensorflow_text>=2.1.0rc0", "tensorflow_hub~=0.7.0"], + "transformers": ["transformers~=2.3.0"], "mitie": ["mitie"], "sql": ["psycopg2~=2.8.2", "SQLAlchemy~=1.3"], "kafka": ["kafka-python~=1.4"], diff --git a/tests/cli/conftest.py b/tests/cli/conftest.py index 2ac6e97f47e3..815030044616 100644 --- a/tests/cli/conftest.py +++ b/tests/cli/conftest.py @@ -5,9 +5,12 @@ from _pytest.tmpdir import TempdirFactory from typing import Callable import pytest +import shutil import os from _pytest.pytester import Testdir, RunResult +from rasa.utils.io import write_yaml_file + @pytest.fixture def run(testdir: Testdir) -> Callable[..., RunResult]: @@ -33,9 +36,23 @@ def init_default_project(tmpdir_factory: TempdirFactory) -> str: os.environ["LOG_LEVEL"] = "ERROR" check_call(["rasa", "init", "--no-prompt"], cwd=path) + return path +@pytest.fixture +def run_in_default_project_without_models(testdir: Testdir) -> Callable[..., RunResult]: + os.environ["LOG_LEVEL"] = "ERROR" + + _set_up_initial_project(testdir) + + def do_run(*args): + args = ["rasa"] + list(args) + return testdir.run(*args) + + return do_run + + @pytest.fixture def run_in_default_project( testdir: Testdir, init_default_project: str @@ -57,3 +74,30 @@ def do_run(*args): return result return do_run + + +def _set_up_initial_project(testdir: Testdir): + # copy initial project files + testdir.copy_example("rasa/cli/initial_project/actions.py") + testdir.copy_example("rasa/cli/initial_project/credentials.yml") + testdir.copy_example("rasa/cli/initial_project/domain.yml") + testdir.copy_example("rasa/cli/initial_project/endpoints.yml") + testdir.mkdir("data") + testdir.copy_example("rasa/cli/initial_project/data") + testdir.run("mv", "nlu.md", "data/nlu.md") + testdir.run("mv", "stories.md", "data/stories.md") + + # create a config file + # for the cli test the resulting model is not important, use components that are + # fast to train + write_yaml_file( + { + "language": "en", + "pipeline": [{"name": "KeywordIntentClassifier"}], + "policies": [ + {"name": "MappingPolicy"}, + {"name": "MemoizationPolicy", "max_history": 5}, + ], + }, + "config.yml", + ) diff --git a/tests/cli/test_rasa_data.py b/tests/cli/test_rasa_data.py index 07e54e8c47ed..937728b59c61 100644 --- a/tests/cli/test_rasa_data.py +++ b/tests/cli/test_rasa_data.py @@ -12,8 +12,10 @@ from rasa.validator import Validator -def test_data_split_nlu(run_in_default_project: Callable[..., RunResult]): - run_in_default_project( +def test_data_split_nlu( + run_in_default_project_without_models: Callable[..., RunResult] +): + run_in_default_project_without_models( "data", "split", "nlu", "-u", "data/nlu.md", "--training-fraction", "0.75" ) @@ -22,8 +24,10 @@ def test_data_split_nlu(run_in_default_project: Callable[..., RunResult]): assert os.path.exists(os.path.join("train_test_split", "training_data.md")) -def test_data_convert_nlu(run_in_default_project: Callable[..., RunResult]): - run_in_default_project( +def test_data_convert_nlu( + run_in_default_project_without_models: Callable[..., RunResult] +): + run_in_default_project_without_models( "data", "convert", "nlu", diff --git a/tests/cli/test_rasa_interactive.py b/tests/cli/test_rasa_interactive.py index 630e3ff451d3..b869ebedd7ac 100644 --- a/tests/cli/test_rasa_interactive.py +++ b/tests/cli/test_rasa_interactive.py @@ -68,7 +68,7 @@ def test_pass_arguments_to_rasa_train( def test_train_called_when_no_model_passed( - default_stack_config: Text, monkeypatch: MonkeyPatch, 
+ default_stack_config: Text, monkeypatch: MonkeyPatch ) -> None: parser = argparse.ArgumentParser() sub_parser = parser.add_subparsers() @@ -97,7 +97,7 @@ def test_train_called_when_no_model_passed( def test_train_core_called_when_no_model_passed_and_core( - default_stack_config: Text, monkeypatch: MonkeyPatch, + default_stack_config: Text, monkeypatch: MonkeyPatch ) -> None: parser = argparse.ArgumentParser() sub_parser = parser.add_subparsers() @@ -129,7 +129,7 @@ def test_train_core_called_when_no_model_passed_and_core( def test_no_interactive_without_core_data( - default_stack_config: Text, monkeypatch: MonkeyPatch, + default_stack_config: Text, monkeypatch: MonkeyPatch ) -> None: parser = argparse.ArgumentParser() sub_parser = parser.add_subparsers() diff --git a/tests/cli/test_rasa_run.py b/tests/cli/test_rasa_run.py index 8de8685e91dd..7a054b26619a 100644 --- a/tests/cli/test_rasa_run.py +++ b/tests/cli/test_rasa_run.py @@ -4,12 +4,13 @@ from _pytest.pytester import RunResult -def test_run_does_not_start(run_in_default_project: Callable[..., RunResult]): +def test_run_does_not_start( + run_in_default_project_without_models: Callable[..., RunResult] +): os.remove("domain.yml") - shutil.rmtree("models") # the server should not start as no model is configured - output = run_in_default_project("run") + output = run_in_default_project_without_models("run") assert "No model found." in output.outlines[0] diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py index 50d21c6e6978..fa21e22c76f3 100644 --- a/tests/cli/test_rasa_test.py +++ b/tests/cli/test_rasa_test.py @@ -58,11 +58,22 @@ def test_test_nlu_cross_validation(run_in_default_project: Callable[..., RunResu assert os.path.exists("results/confmat.png") -def test_test_nlu_comparison(run_in_default_project: Callable[..., RunResult]): - copyfile("config.yml", "nlu-config.yml") +def test_test_nlu_comparison( + run_in_default_project_without_models: Callable[..., RunResult] +): + copyfile("config.yml", "config-1.yml") - run_in_default_project( - "test", "nlu", "-c", "config.yml", "nlu-config.yml", "--run", "2" + run_in_default_project_without_models( + "test", + "nlu", + "--config", + "config.yml", + "config-1.yml", + "--run", + "2", + "--percentages", + "75", + "25", ) assert os.path.exists("results/run_1") @@ -106,6 +117,7 @@ def test_test_core_comparison_after_train( }, "config_2.yml", ) + run_in_default_project( "train", "core", diff --git a/tests/cli/test_rasa_train.py b/tests/cli/test_rasa_train.py index a451c5d914c8..d0147a575ea3 100644 --- a/tests/cli/test_rasa_train.py +++ b/tests/cli/test_rasa_train.py @@ -18,10 +18,10 @@ import rasa.utils.io as io_utils -def test_train(run_in_default_project: Callable[..., RunResult]): +def test_train(run_in_default_project_without_models: Callable[..., RunResult]): temp_dir = os.getcwd() - run_in_default_project( + run_in_default_project_without_models( "train", "-c", "config.yml", @@ -48,10 +48,12 @@ def test_train(run_in_default_project: Callable[..., RunResult]): ) -def test_train_persist_nlu_data(run_in_default_project: Callable[..., RunResult]): +def test_train_persist_nlu_data( + run_in_default_project_without_models: Callable[..., RunResult] +): temp_dir = os.getcwd() - run_in_default_project( + run_in_default_project_without_models( "train", "-c", "config.yml", @@ -79,7 +81,9 @@ def test_train_persist_nlu_data(run_in_default_project: Callable[..., RunResult] ) -def test_train_core_compare(run_in_default_project: Callable[..., RunResult]): +def test_train_core_compare( + 
run_in_default_project_without_models: Callable[..., RunResult] +): temp_dir = os.getcwd() io_utils.write_yaml_file( @@ -100,7 +104,7 @@ def test_train_core_compare(run_in_default_project: Callable[..., RunResult]): "config_2.yml", ) - run_in_default_project( + run_in_default_project_without_models( "train", "core", "-c", @@ -132,11 +136,11 @@ def test_train_core_compare(run_in_default_project: Callable[..., RunResult]): def test_train_no_domain_exists( - run_in_default_project: Callable[..., RunResult] + run_in_default_project_without_models: Callable[..., RunResult] ) -> None: os.remove("domain.yml") - run_in_default_project( + run_in_default_project_without_models( "train", "-c", "config.yml", @@ -191,14 +195,13 @@ def test_train_force(run_in_default_project): assert len(files) == 2 -def test_train_with_only_nlu_data(run_in_default_project): +def test_train_with_only_nlu_data(run_in_default_project_without_models): temp_dir = os.getcwd() assert os.path.exists(os.path.join(temp_dir, "data/stories.md")) os.remove(os.path.join(temp_dir, "data/stories.md")) - shutil.rmtree(os.path.join(temp_dir, "models")) - run_in_default_project("train", "--fixed-model-name", "test-model") + run_in_default_project_without_models("train", "--fixed-model-name", "test-model") assert os.path.exists(os.path.join(temp_dir, "models")) files = io_utils.list_files(os.path.join(temp_dir, "models")) @@ -206,14 +209,13 @@ def test_train_with_only_nlu_data(run_in_default_project): assert os.path.basename(files[0]) == "test-model.tar.gz" -def test_train_with_only_core_data(run_in_default_project): +def test_train_with_only_core_data(run_in_default_project_without_models): temp_dir = os.getcwd() assert os.path.exists(os.path.join(temp_dir, "data/nlu.md")) os.remove(os.path.join(temp_dir, "data/nlu.md")) - shutil.rmtree(os.path.join(temp_dir, "models")) - run_in_default_project("train", "--fixed-model-name", "test-model") + run_in_default_project_without_models("train", "--fixed-model-name", "test-model") assert os.path.exists(os.path.join(temp_dir, "models")) files = io_utils.list_files(os.path.join(temp_dir, "models")) @@ -221,8 +223,8 @@ def test_train_with_only_core_data(run_in_default_project): assert os.path.basename(files[0]) == "test-model.tar.gz" -def test_train_core(run_in_default_project: Callable[..., RunResult]): - run_in_default_project( +def test_train_core(run_in_default_project_without_models: Callable[..., RunResult]): + run_in_default_project_without_models( "train", "core", "-c", @@ -241,10 +243,12 @@ def test_train_core(run_in_default_project: Callable[..., RunResult]): assert os.path.isfile("train_rasa_models/rasa-model.tar.gz") -def test_train_core_no_domain_exists(run_in_default_project: Callable[..., RunResult]): +def test_train_core_no_domain_exists( + run_in_default_project_without_models: Callable[..., RunResult] +): os.remove("domain.yml") - run_in_default_project( + run_in_default_project_without_models( "train", "core", "--config", @@ -263,8 +267,8 @@ def test_train_core_no_domain_exists(run_in_default_project: Callable[..., RunRe assert not os.path.isfile("train_rasa_models_no_domain/rasa-model.tar.gz") -def test_train_nlu(run_in_default_project: Callable[..., RunResult]): - run_in_default_project( +def test_train_nlu(run_in_default_project_without_models: Callable[..., RunResult]): + run_in_default_project_without_models( "train", "nlu", "-c", @@ -289,9 +293,9 @@ def test_train_nlu(run_in_default_project: Callable[..., RunResult]): def test_train_nlu_persist_nlu_data( - 
run_in_default_project: Callable[..., RunResult] + run_in_default_project_without_models: Callable[..., RunResult] ) -> None: - run_in_default_project( + run_in_default_project_without_models( "train", "nlu", "-c", diff --git a/tests/conftest.py b/tests/conftest.py index ad1823daba05..95d696483338 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import asyncio +import os import random import uuid @@ -30,6 +31,7 @@ from rasa.model import get_model from rasa.train import train_async from rasa.utils.common import TempDirectoryPath +import rasa.utils.io as io_utils from tests.core.conftest import ( DEFAULT_DOMAIN_PATH_WITH_SLOTS, DEFAULT_NLU_DATA, @@ -38,7 +40,7 @@ END_TO_END_STORY_FILE, MOODBOT_MODEL_PATH, ) -import rasa.utils.io as io_utils +from tests.utilities import update_number_of_epochs DEFAULT_CONFIG_PATH = "rasa/cli/default_config.yml" @@ -84,10 +86,15 @@ async def default_agent(_trained_default_agent: Agent) -> Agent: @pytest.fixture(scope="session") -async def trained_moodbot_path() -> Text: +async def trained_moodbot_path(tmpdir_factory: TempdirFactory) -> Text: + output = tmpdir_factory.mktemp("moodbot").strpath + tmp_config_file = os.path.join(output, "config.yml") + + update_number_of_epochs("examples/moodbot/config.yml", tmp_config_file) + return await train_async( domain="examples/moodbot/domain.yml", - config="examples/moodbot/config.yml", + config=tmp_config_file, training_files="examples/moodbot/data/", output_path=MOODBOT_MODEL_PATH, ) @@ -252,9 +259,7 @@ def write_endpoint_config_to_yaml( endpoints_path = path / endpoints_filename # write endpoints config to file - io_utils.write_yaml_file( - data, endpoints_path, - ) + io_utils.write_yaml_file(data, endpoints_path) return endpoints_path diff --git a/tests/core/conftest.py b/tests/core/conftest.py index 2613fcc98d7c..1d314f22b0dc 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -84,9 +84,7 @@ def __init__(self, example_arg): class MockedMongoTrackerStore(MongoTrackerStore): """In-memory mocked version of `MongoTrackerStore`.""" - def __init__( - self, _domain: Domain, - ): + def __init__(self, _domain: Domain): from mongomock import MongoClient self.db = MongoClient().rasa @@ -181,7 +179,7 @@ def tracker_with_six_scheduled_reminders( ), ReminderScheduled("default", datetime.now(), kill_on_user_message=False), ReminderScheduled( - "default", datetime.now(), kill_on_user_message=False, name="special", + "default", datetime.now(), kill_on_user_message=False, name="special" ), ] sender_id = uuid.uuid4().hex diff --git a/tests/core/test_agent.py b/tests/core/test_agent.py index d181d847cc56..0f07a33af7ac 100644 --- a/tests/core/test_agent.py +++ b/tests/core/test_agent.py @@ -5,7 +5,7 @@ from sanic import Sanic, response import rasa.core -from rasa.core.policies.embedding_policy import EmbeddingPolicy +from rasa.core.policies.ted_policy import TEDPolicy from rasa.core.policies.mapping_policy import MappingPolicy import rasa.utils.io from rasa.core import jobs, utils @@ -81,7 +81,7 @@ async def test_agent_train(trained_moodbot_path: Text): # test policies assert isinstance(loaded.policy_ensemble, SimplePolicyEnsemble) assert [type(p) for p in loaded.policy_ensemble.policies] == [ - EmbeddingPolicy, + TEDPolicy, MemoizationPolicy, MappingPolicy, ] diff --git a/tests/core/test_interactive.py b/tests/core/test_interactive.py index 341081949f74..c0e7cd854eb3 100644 --- a/tests/core/test_interactive.py +++ b/tests/core/test_interactive.py @@ -32,7 +32,7 @@ def mock_endpoint(): 
@pytest.fixture def mock_file_importer( - default_stack_config: Text, default_nlu_data: Text, default_stories_file: Text, + default_stack_config: Text, default_nlu_data: Text, default_stories_file: Text ): domain_path = DEFAULT_DOMAIN_PATH_WITH_SLOTS return TrainingDataImporter.load_from_config( diff --git a/tests/core/test_lock_store.py b/tests/core/test_lock_store.py index 68e76896917a..fdb192074f0d 100644 --- a/tests/core/test_lock_store.py +++ b/tests/core/test_lock_store.py @@ -12,12 +12,7 @@ from rasa.core.channels import UserMessage from rasa.core.constants import INTENT_MESSAGE_PREFIX, DEFAULT_LOCK_LIFETIME from rasa.core.lock import TicketLock -from rasa.core.lock_store import ( - InMemoryLockStore, - LockError, - LockStore, - RedisLockStore, -) +from rasa.core.lock_store import InMemoryLockStore, LockError, LockStore, RedisLockStore class FakeRedisLockStore(RedisLockStore): diff --git a/tests/core/test_nlg.py b/tests/core/test_nlg.py index 4d5ac70975d5..fa890404844a 100644 --- a/tests/core/test_nlg.py +++ b/tests/core/test_nlg.py @@ -1,13 +1,10 @@ -import asyncio import uuid from typing import Text, Any import jsonschema import pytest -from flask import Flask, request, jsonify -from pytest_localserver.http import WSGIServer +from sanic import Sanic, response -import rasa.utils.io from rasa.core.nlg.callback import ( nlg_request_format_spec, CallbackNaturalLanguageGenerator, @@ -19,10 +16,11 @@ def nlg_app(base_url="/"): - app = Flask(__name__) + + app = Sanic(__name__) @app.route(base_url, methods=["POST"]) - def generate(): + async def generate(request): """Simple HTTP NLG generator, checks that the incoming request is format according to the spec.""" @@ -31,28 +29,26 @@ def generate(): jsonschema.validate(nlg_call, nlg_request_format_spec()) if nlg_call.get("template") == "utter_greet": - response = {"text": "Hey there!"} + response_dict = {"text": "Hey there!"} else: - response = {"text": "Sorry, didn't get that."} - return jsonify(response) + response_dict = {"text": "Sorry, didn't get that."} + return response.json(response_dict) return app # noinspection PyShadowingNames -@pytest.fixture(scope="module") -def http_nlg(request): - http_server = WSGIServer(application=nlg_app()) - http_server.start() - - request.addfinalizer(http_server.stop) - return http_server.url +@pytest.fixture() +async def http_nlg(test_server): + server = await test_server(nlg_app()) + yield server + await server.close() async def test_nlg(http_nlg, trained_rasa_model): sender = str(uuid.uuid1()) - nlg_endpoint = EndpointConfig.from_dict({"url": http_nlg}) + nlg_endpoint = EndpointConfig.from_dict({"url": http_nlg.make_url("/")}) agent = Agent.load(trained_rasa_model, None, generator=nlg_endpoint) response = await agent.handle_text("/greet", sender_id=sender) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 7dd73f476229..381b98aad4b6 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -24,7 +24,7 @@ FullDialogueTrackerFeaturizer, ) from rasa.core.policies.two_stage_fallback import TwoStageFallbackPolicy -from rasa.core.policies.embedding_policy import EmbeddingPolicy +from rasa.core.policies.ted_policy import TEDPolicy from rasa.core.policies.fallback import FallbackPolicy from rasa.core.policies.form_policy import FormPolicy from rasa.core.policies.keras_policy import KerasPolicy @@ -32,6 +32,17 @@ from rasa.core.policies.memoization import AugmentedMemoizationPolicy, MemoizationPolicy from rasa.core.policies.sklearn_policy import SklearnPolicy 
from rasa.core.trackers import DialogueStateTracker +from rasa.utils.tensorflow.constants import ( + SIMILARITY_TYPE, + RANKING_LENGTH, + LOSS_TYPE, + SCALE_LOSS, + EVAL_NUM_EXAMPLES, + EPOCHS, + KEY_RELATIVE_ATTENTION, + VALUE_RELATIVE_ATTENTION, + MAX_RELATIVE_POSITION, +) from rasa.utils import train_utils from tests.core.conftest import ( DEFAULT_DOMAIN_PATH_WITH_MAPPING, @@ -41,35 +52,6 @@ from tests.core.utilities import get_tracker, read_dialogue_file, user_uttered -def tf_defaults(): - return { - "tf_config": { - "device_count": {"CPU": 4}, - # tell tf.Session to use CPU limit, if you have - # more CPU, you can increase this value appropriately - "inter_op_parallelism_threads": 0, - # the number of threads in the thread pool available - # for each process for blocking operation nodes set to 0 - # to allow the system to select the appropriate value. - "intra_op_parallelism_threads": 0, # tells the degree of thread - # parallelism of the tf.Session operation. - # the smaller the value, the less reuse the thread will have - # and the more likely it will use more CPU cores. - # if the value is 0, - # tensorflow will automatically select an appropriate value. - "gpu_options": {"allow_growth": True} - # if set True, will try to allocate - # as much GPU memory as possible to support running - } - } - - -def session_config(): - import tensorflow as tf - - return tf.ConfigProto(**tf_defaults()["tf_config"]) - - async def train_trackers(domain, augmentation_factor=20): return await training.load_data( DEFAULT_STORIES_FILE, domain, augmentation_factor=augmentation_factor @@ -168,17 +150,6 @@ def test_persist_and_load_empty_policy(self, tmpdir): loaded = empty_policy.__class__.load(tmpdir.strpath) assert loaded is not None - def test_tf_config(self, trained_policy, tmpdir): - if hasattr(trained_policy, "session"): - import tensorflow as tf - - # noinspection PyProtectedMember - assert trained_policy.session._config == tf.Session()._config - trained_policy.persist(tmpdir.strpath) - loaded = trained_policy.__class__.load(tmpdir.strpath) - # noinspection PyProtectedMember - assert loaded.session._config == tf.Session()._config - @staticmethod def _get_next_action(policy, events, domain): tracker = get_tracker(events) @@ -194,20 +165,6 @@ def create_policy(self, featurizer, priority): return p -class TestKerasPolicyWithTfConfig(PolicyTestCollection): - def create_policy(self, featurizer, priority): - p = KerasPolicy(featurizer, priority, **tf_defaults()) - return p - - def test_tf_config(self, trained_policy, tmpdir): - # noinspection PyProtectedMember - assert trained_policy.session._config == session_config() - trained_policy.persist(tmpdir.strpath) - loaded = trained_policy.__class__.load(tmpdir.strpath) - # noinspection PyProtectedMember - assert loaded.session._config == session_config() - - class TestSklearnPolicy(PolicyTestCollection): def create_policy(self, featurizer, priority, **kwargs): p = SklearnPolicy(featurizer, priority, **kwargs) @@ -323,16 +280,16 @@ def test_train_with_shuffle_false( policy.train(trackers, domain=default_domain) -class TestEmbeddingPolicy(PolicyTestCollection): +class TestTEDPolicy(PolicyTestCollection): def create_policy(self, featurizer, priority): - p = EmbeddingPolicy(featurizer=featurizer, priority=priority) + p = TEDPolicy(featurizer=featurizer, priority=priority) return p def test_similarity_type(self, trained_policy): - assert trained_policy.similarity_type == "inner" + assert trained_policy.config[SIMILARITY_TYPE] == "inner" def 
test_ranking_length(self, trained_policy): - assert trained_policy.ranking_length == 10 + assert trained_policy.config[RANKING_LENGTH] == 10 def test_normalization(self, trained_policy, tracker, default_domain, monkeypatch): # first check the output is what we expect @@ -342,7 +299,7 @@ def test_normalization(self, trained_policy, tracker, default_domain, monkeypatc # count number of non-zero confidences assert ( sum([confidence > 0 for confidence in predicted_probabilities]) - == trained_policy.ranking_length + == trained_policy.config[RANKING_LENGTH] ) # check that the norm is still 1 assert sum(predicted_probabilities) == pytest.approx(1) @@ -359,45 +316,33 @@ async def test_gen_batch(self, trained_policy, default_domain): training_data = trained_policy.featurize_for_training( training_trackers, default_domain ) - session_data = trained_policy._create_session_data( - training_data.X, training_data.y - ) + model_data = trained_policy._create_model_data(training_data.X, training_data.y) batch_size = 2 - batch_x, batch_y, _ = next( - train_utils.gen_batch( - session_data=session_data, batch_size=batch_size, label_key="action_ids" - ) - ) + batch_x, batch_y, _ = next(model_data._gen_batch(batch_size=batch_size)) assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size assert ( - batch_x[0].shape == session_data["dialogue_features"][0][0].shape - and batch_y[0].shape == session_data["bot_features"][0][0].shape + batch_x[0].shape == model_data.get("dialogue_features")[0][0].shape + and batch_y[0].shape == model_data.get("label_features")[0][0].shape ) batch_x, batch_y, _ = next( - train_utils.gen_batch( - session_data=session_data, - batch_size=batch_size, - label_key="action_ids", - batch_strategy="balanced", - shuffle=True, + model_data._gen_batch( + batch_size=batch_size, batch_strategy="balanced", shuffle=True ) ) assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size assert ( - batch_x[0].shape == session_data["dialogue_features"][0][0].shape - and batch_y[0].shape == session_data["bot_features"][0][0].shape + batch_x[0].shape == model_data.get("dialogue_features")[0][0].shape + and batch_y[0].shape == model_data.get("label_features")[0][0].shape ) -class TestEmbeddingPolicyMargin(TestEmbeddingPolicy): +class TestTEDPolicyMargin(TestTEDPolicy): def create_policy(self, featurizer, priority): - p = EmbeddingPolicy( - featurizer=featurizer, priority=priority, **{"loss_type": "margin"} - ) + p = TEDPolicy(featurizer=featurizer, priority=priority, **{LOSS_TYPE: "margin"}) return p def test_similarity_type(self, trained_policy): - assert trained_policy.similarity_type == "cosine" + assert trained_policy.config[SIMILARITY_TYPE] == "cosine" def test_normalization(self, trained_policy, tracker, default_domain, monkeypatch): # Mock actual normalization method @@ -409,25 +354,23 @@ def test_normalization(self, trained_policy, tracker, default_domain, monkeypatc mock.normalize.assert_not_called() -class TestEmbeddingPolicyWithEval(TestEmbeddingPolicy): +class TestTEDPolicyWithEval(TestTEDPolicy): def create_policy(self, featurizer, priority): - p = EmbeddingPolicy( + p = TEDPolicy( featurizer=featurizer, priority=priority, - **{"scale_loss": False, "evaluate_on_num_examples": 4}, + **{SCALE_LOSS: False, EVAL_NUM_EXAMPLES: 4}, ) return p -class TestEmbeddingPolicyNoNormalization(TestEmbeddingPolicy): +class TestTEDPolicyNoNormalization(TestTEDPolicy): def create_policy(self, featurizer, priority): - p = EmbeddingPolicy( - featurizer=featurizer, priority=priority, 
**{"ranking_length": 0} - ) + p = TEDPolicy(featurizer=featurizer, priority=priority, **{RANKING_LENGTH: 0}) return p def test_ranking_length(self, trained_policy): - assert trained_policy.ranking_length == 0 + assert trained_policy.config[RANKING_LENGTH] == 0 def test_normalization(self, trained_policy, tracker, default_domain, monkeypatch): # first check the output is what we expect @@ -445,34 +388,30 @@ def test_normalization(self, trained_policy, tracker, default_domain, monkeypatc mock.normalize.assert_not_called() -class TestEmbeddingPolicyLowRankingLength(TestEmbeddingPolicy): +class TestTEDPolicyLowRankingLength(TestTEDPolicy): def create_policy(self, featurizer, priority): - p = EmbeddingPolicy( - featurizer=featurizer, priority=priority, **{"ranking_length": 3} - ) + p = TEDPolicy(featurizer=featurizer, priority=priority, **{RANKING_LENGTH: 3}) return p def test_ranking_length(self, trained_policy): - assert trained_policy.ranking_length == 3 + assert trained_policy.config[RANKING_LENGTH] == 3 -class TestEmbeddingPolicyHighRankingLength(TestEmbeddingPolicy): +class TestTEDPolicyHighRankingLength(TestTEDPolicy): def create_policy(self, featurizer, priority): - p = EmbeddingPolicy( - featurizer=featurizer, priority=priority, **{"ranking_length": 11} - ) + p = TEDPolicy(featurizer=featurizer, priority=priority, **{RANKING_LENGTH: 11}) return p def test_ranking_length(self, trained_policy): - assert trained_policy.ranking_length == 11 + assert trained_policy.config[RANKING_LENGTH] == 11 -class TestEmbeddingPolicyWithFullDialogue(TestEmbeddingPolicy): +class TestTEDPolicyWithFullDialogue(TestTEDPolicy): def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, + # use standard featurizer from TEDPolicy, # since it is using FullDialogueTrackerFeaturizer # if max_history is not specified - p = EmbeddingPolicy(priority=priority) + p = TEDPolicy(priority=priority) return p def test_featurizer(self, trained_policy, tmpdir): @@ -489,12 +428,12 @@ def test_featurizer(self, trained_policy, tmpdir): ) -class TestEmbeddingPolicyWithMaxHistory(TestEmbeddingPolicy): +class TestTEDPolicyWithMaxHistory(TestTEDPolicy): def create_policy(self, featurizer, priority): - # use standard featurizer from EmbeddingPolicy, + # use standard featurizer from TEDPolicy, # since it is using MaxHistoryTrackerFeaturizer # if max_history is specified - p = EmbeddingPolicy(priority=priority, max_history=self.max_history) + p = TEDPolicy(priority=priority, max_history=self.max_history) return p def test_featurizer(self, trained_policy, tmpdir): @@ -513,19 +452,19 @@ def test_featurizer(self, trained_policy, tmpdir): ) -class TestEmbeddingPolicyWithTfConfig(TestEmbeddingPolicy): +class TestTEDPolicyWithRelativeAttention(TestTEDPolicy): def create_policy(self, featurizer, priority): - p = EmbeddingPolicy(featurizer=featurizer, priority=priority, **tf_defaults()) + p = TEDPolicy( + featurizer=featurizer, + priority=priority, + **{ + KEY_RELATIVE_ATTENTION: True, + VALUE_RELATIVE_ATTENTION: True, + MAX_RELATIVE_POSITION: 5, + }, + ) return p - def test_tf_config(self, trained_policy, tmpdir): - # noinspection PyProtectedMember - assert trained_policy.session._config == session_config() - trained_policy.persist(tmpdir.strpath) - loaded = trained_policy.__class__.load(tmpdir.strpath) - # noinspection PyProtectedMember - assert loaded.session._config == session_config() - class TestMemoizationPolicy(PolicyTestCollection): def create_policy(self, featurizer, priority): diff --git 
a/tests/core/test_processor.py b/tests/core/test_processor.py
index 18938ca4ee08..d67a3f07be2e 100644
--- a/tests/core/test_processor.py
+++ b/tests/core/test_processor.py
@@ -424,7 +424,7 @@ async def test_reminder_restart(
         # last user event is way in the past
         (UserUttered(timestamp=1), 60, True),
         # user event are very recent
-        (UserUttered("hello", timestamp=time.time()), 60, False,),
+        (UserUttered("hello", timestamp=time.time()), 120, False),
         # there is user event
         (ActionExecuted(ACTION_LISTEN_NAME, timestamp=time.time()), 60, False),
         # Old event, but sessions are disabled
@@ -517,10 +517,7 @@ async def test_update_tracker_session_with_slots(
     events = list(tracker.events)

     # the first three events should be up to the user utterance
-    assert events[:2] == [
-        ActionExecuted(ACTION_LISTEN_NAME),
-        user_event,
-    ]
+    assert events[:2] == [ActionExecuted(ACTION_LISTEN_NAME), user_event]

     # next come the five slots
     assert events[2:7] == slot_set_events
@@ -537,7 +534,7 @@ async def test_update_tracker_session_with_slots(

 # noinspection PyProtectedMember
 async def test_get_tracker_with_session_start(
-    default_channel: CollectingOutputChannel, default_processor: MessageProcessor,
+    default_channel: CollectingOutputChannel, default_processor: MessageProcessor
 ):
     sender_id = uuid.uuid4().hex
     tracker = await default_processor.get_tracker_with_session_start(
diff --git a/tests/core/test_restore.py b/tests/core/test_restore.py
index 5a1554e6f9ad..9d792da4b15e 100644
--- a/tests/core/test_restore.py
+++ b/tests/core/test_restore.py
@@ -17,9 +17,11 @@ async def test_restoring_tracker(trained_moodbot_path, recwarn):

     await restore.replay_events(tracker, agent)

-    # makes sure there are no warnings. warnings are raised, if the models
+    # makes sure there are no warnings. Warnings are raised if the models
     # predictions differ from the tracker when the dumped tracker is replayed
-    assert [e for e in recwarn if e._category_name == "UserWarning"] == []
+    # TODO tensorflow is printing a warning currently, should be resolved with an
+    #  upcoming version (https://github.com/tensorflow/tensorflow/issues/35100)
+    # assert [e for e in recwarn if e._category_name == "UserWarning"] == []

     assert len(tracker.events) == 7
     assert tracker.latest_action_name == "action_listen"
diff --git a/tests/core/test_structures.py b/tests/core/test_structures.py
index a6407b576bd9..ea688b3709e2 100644
--- a/tests/core/test_structures.py
+++ b/tests/core/test_structures.py
@@ -1,11 +1,6 @@
 from rasa.core.actions.action import ACTION_SESSION_START_NAME
 from rasa.core.domain import Domain
-from rasa.core.events import (
-    SessionStarted,
-    SlotSet,
-    UserUttered,
-    ActionExecuted,
-)
+from rasa.core.events import SessionStarted, SlotSet, UserUttered, ActionExecuted
 from rasa.core.trackers import DialogueStateTracker
 from rasa.core.training.structures import Story

diff --git a/tests/core/test_trackers.py b/tests/core/test_trackers.py
index 9091394ca283..81924c4b05a3 100644
--- a/tests/core/test_trackers.py
+++ b/tests/core/test_trackers.py
@@ -92,9 +92,7 @@ def test_tracker_store_storage_and_retrieval(store):
     assert tracker.sender_id == "some-id"

     # Action listen should be in there
-    assert list(tracker.events) == [
-        ActionExecuted(ACTION_LISTEN_NAME),
-    ]
+    assert list(tracker.events) == [ActionExecuted(ACTION_LISTEN_NAME)]

     # lets log a test message
     intent = {"name": "greet", "confidence": 1.0}
diff --git a/tests/core/test_training.py b/tests/core/test_training.py
index c2195f2991cd..c61847392073 100644
--- a/tests/core/test_training.py
+++ b/tests/core/test_training.py @@ -120,6 +120,7 @@ def configs_for_random_seed_test(): async def test_random_seed(tmpdir, config_file): # set random seed in config file to # generate a reproducible training result + agent_1 = await train( DEFAULT_DOMAIN_PATH_WITH_SLOTS, DEFAULT_STORIES_FILE, diff --git a/tests/nlu/base/test_config.py b/tests/nlu/base/test_config.py deleted file mode 100644 index be729075adb3..000000000000 --- a/tests/nlu/base/test_config.py +++ /dev/null @@ -1,83 +0,0 @@ -import json -import tempfile -from typing import Text - -import pytest - -import rasa.utils.io -from rasa.nlu import config -from rasa.nlu.components import ComponentBuilder -from rasa.nlu.registry import registered_pipeline_templates -from tests.nlu.conftest import CONFIG_DEFAULTS_PATH -from tests.nlu.utilities import write_file_config - -defaults = rasa.utils.io.read_config_file(CONFIG_DEFAULTS_PATH) - - -def test_default_config(default_config): - assert default_config.as_dict() == defaults - - -def test_blank_config(): - file_config = {} - f = write_file_config(file_config) - final_config = config.load(f.name) - assert final_config.as_dict() == defaults - - -def test_invalid_config_json(): - file_config = """pipeline: [pretrained_embeddings_spacy""" # invalid yaml - with tempfile.NamedTemporaryFile("w+", suffix="_tmp_config_file.json") as f: - f.write(file_config) - f.flush() - with pytest.raises(config.InvalidConfigError): - config.load(f.name) - - -def test_invalid_pipeline_template(): - args = {"pipeline": "my_made_up_name"} - f = write_file_config(args) - with pytest.raises(config.InvalidConfigError) as execinfo: - config.load(f.name) - assert "unknown pipeline template" in str(execinfo.value) - - -@pytest.mark.parametrize( - "pipeline_template", list(registered_pipeline_templates.keys()) -) -def test_pipeline_registry_lookup(pipeline_template: Text): - args = {"pipeline": pipeline_template} - f = write_file_config(args) - final_config = config.load(f.name) - components = [c for c in final_config.pipeline] - - assert json.dumps(components, sort_keys=True) == json.dumps( - registered_pipeline_templates[pipeline_template], sort_keys=True - ) - - -def test_default_config_file(): - final_config = config.RasaNLUModelConfig() - assert len(final_config) > 1 - - -def test_set_attr_on_component(): - cfg = config.load("sample_configs/config_pretrained_embeddings_spacy.yml") - cfg.set_component_attr(6, C=324) - - assert cfg.for_component(1) == {"name": "SpacyTokenizer"} - assert cfg.for_component(6) == {"name": "SklearnIntentClassifier", "C": 324} - - -def test_override_defaults_supervised_embeddings_pipeline(): - cfg = config.load("data/test/config_embedding_test.yml") - builder = ComponentBuilder() - - component1_cfg = cfg.for_component(0) - - component1 = builder.create_component(component1_cfg, cfg) - assert component1.max_ngram == 3 - - component2_cfg = cfg.for_component(1) - component2 = builder.create_component(component2_cfg, cfg) - assert component2.epochs == 10 diff --git a/tests/nlu/base/test_emulators.py b/tests/nlu/base/test_emulators.py deleted file mode 100644 index 2d21e966b909..000000000000 --- a/tests/nlu/base/test_emulators.py +++ /dev/null @@ -1,152 +0,0 @@ -def test_luis_request(): - from rasa.nlu.emulators.luis import LUISEmulator - - em = LUISEmulator() - norm = em.normalise_request_json({"text": ["arb text"]}) - assert norm == {"text": "arb text", "time": None} - - -def test_luis_response(): - from rasa.nlu.emulators.luis import LUISEmulator - - em = LUISEmulator() - data = { - 
"text": "I want italian food", - "intent": {"name": "restaurant_search", "confidence": 0.737014589341683}, - "intent_ranking": [ - {"confidence": 0.737014589341683, "name": "restaurant_search"}, - {"confidence": 0.11605464483122209, "name": "goodbye"}, - {"confidence": 0.08816417744097163, "name": "greet"}, - {"confidence": 0.058766588386123204, "name": "affirm"}, - ], - "entities": [{"entity": "cuisine", "value": "italian"}], - } - norm = em.normalise_response_json(data) - assert norm == { - "query": data["text"], - "topScoringIntent": {"intent": "restaurant_search", "score": 0.737014589341683}, - "intents": [ - {"intent": "restaurant_search", "score": 0.737014589341683}, - {"intent": "goodbye", "score": 0.11605464483122209}, - {"intent": "greet", "score": 0.08816417744097163}, - {"intent": "affirm", "score": 0.058766588386123204}, - ], - "entities": [ - { - "entity": e["value"], - "type": e["entity"], - "startIndex": None, - "endIndex": None, - "score": None, - } - for e in data["entities"] - ], - } - - -def test_wit_request(): - from rasa.nlu.emulators.wit import WitEmulator - - em = WitEmulator() - norm = em.normalise_request_json({"text": ["arb text"]}) - assert norm == {"text": "arb text", "time": None} - - -def test_wit_response(): - from rasa.nlu.emulators.wit import WitEmulator - - em = WitEmulator() - data = { - "text": "I want italian food", - "intent": {"name": "inform", "confidence": 0.4794813722432127}, - "entities": [{"entity": "cuisine", "value": "italian", "start": 7, "end": 14}], - } - norm = em.normalise_response_json(data) - assert norm == [ - { - "entities": { - "cuisine": { - "confidence": None, - "type": "value", - "value": "italian", - "start": 7, - "end": 14, - } - }, - "intent": "inform", - "_text": "I want italian food", - "confidence": 0.4794813722432127, - } - ] - - -def test_dialogflow_request(): - from rasa.nlu.emulators.dialogflow import DialogflowEmulator - - em = DialogflowEmulator() - norm = em.normalise_request_json({"text": ["arb text"]}) - assert norm == {"text": "arb text", "time": None} - - -def test_dialogflow_response(): - from rasa.nlu.emulators.dialogflow import DialogflowEmulator - - em = DialogflowEmulator() - data = { - "text": "I want italian food", - "intent": {"name": "inform", "confidence": 0.4794813722432127}, - "entities": [{"entity": "cuisine", "value": "italian", "start": 7, "end": 14}], - } - norm = em.normalise_response_json(data) - - assert norm == { - "id": norm["id"], - "result": { - "action": data["intent"]["name"], - "actionIncomplete": False, - "contexts": [], - "fulfillment": {}, - "metadata": { - "intentId": norm["result"]["metadata"]["intentId"], - "intentName": data["intent"]["name"], - "webhookUsed": "false", - }, - "parameters": {"cuisine": ["italian"]}, - "resolvedQuery": data["text"], - "score": data["intent"]["confidence"], - "source": "agent", - }, - "sessionId": norm["sessionId"], - "status": {"code": 200, "errorType": "success"}, - "timestamp": norm["timestamp"], - } - - -def test_dummy_request(): - from rasa.nlu.emulators.no_emulator import NoEmulator - - em = NoEmulator() - norm = em.normalise_request_json({"text": ["arb text"]}) - assert norm == {"text": "arb text", "time": None} - - norm = em.normalise_request_json({"text": ["arb text"], "time": "1499279161658"}) - assert norm == {"text": "arb text", "time": "1499279161658"} - - -def test_dummy_response(): - from rasa.nlu.emulators.no_emulator import NoEmulator - - em = NoEmulator() - data = {"intent": "greet", "text": "hi", "entities": {}, "confidence": 1.0} - 
assert em.normalise_response_json(data) == data - - -def test_emulators_can_handle_missing_data(): - from rasa.nlu.emulators.luis import LUISEmulator - - em = LUISEmulator() - norm = em.normalise_response_json( - {"text": "this data doesn't contain an intent result"} - ) - assert norm["topScoringIntent"] is None - assert norm["intents"] == [] diff --git a/tests/nlu/base/test_synonyms.py b/tests/nlu/base/test_synonyms.py deleted file mode 100644 index c9ef7c7eb58c..000000000000 --- a/tests/nlu/base/test_synonyms.py +++ /dev/null @@ -1,17 +0,0 @@ -from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper -from rasa.nlu.model import Metadata -import pytest - - -def test_entity_synonyms(): - entities = [ - {"entity": "test", "value": "chines", "start": 0, "end": 6}, - {"entity": "test", "value": "chinese", "start": 0, "end": 6}, - {"entity": "test", "value": "china", "start": 0, "end": 6}, - ] - ent_synonyms = {"chines": "chinese", "NYC": "New York City"} - EntitySynonymMapper(synonyms=ent_synonyms).replace_synonyms(entities) - assert len(entities) == 3 - assert entities[0]["value"] == "chinese" - assert entities[1]["value"] == "chinese" - assert entities[2]["value"] == "china" diff --git a/tests/nlu/classifiers/test_embedding_intent_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py similarity index 57% rename from tests/nlu/classifiers/test_embedding_intent_classifier.py rename to tests/nlu/classifiers/test_diet_classifier.py index 4692955bd20f..4c38733607c2 100644 --- a/tests/nlu/classifiers/test_embedding_intent_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -1,19 +1,20 @@ import numpy as np import pytest -import scipy.sparse from unittest.mock import Mock from rasa.nlu import train from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import ( - TEXT_ATTRIBUTE, - SPARSE_FEATURE_NAMES, - DENSE_FEATURE_NAMES, - INTENT_ATTRIBUTE, +from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, INTENT +from rasa.utils.tensorflow.constants import ( + LOSS_TYPE, + RANDOM_SEED, + RANKING_LENGTH, + EPOCHS, + MASKED_LM, ) -from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier +from rasa.nlu.classifiers.diet_classifier import DIETClassifier from rasa.nlu.model import Interpreter from rasa.nlu.training_data import Message from rasa.utils import train_utils @@ -28,7 +29,7 @@ def test_compute_default_label_features(): Message("test d"), ] - output = EmbeddingIntentClassifier._compute_default_label_features(label_features) + output = DIETClassifier._compute_default_label_features(label_features) output = output[0] @@ -38,37 +39,6 @@ def test_compute_default_label_features(): assert o.shape == (1, len(label_features)) -def test_get_num_of_features(): - session_data = { - "text_features": [ - np.array( - [ - np.random.rand(5, 14), - np.random.rand(2, 14), - np.random.rand(3, 14), - np.random.rand(1, 14), - np.random.rand(3, 14), - ] - ), - np.array( - [ - scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), - ] - ), - ] - } - - num_features = EmbeddingIntentClassifier._get_num_of_features( - session_data, "text_features" - ) - - assert num_features == 24 - - @pytest.mark.parametrize( 
"messages, expected", [ @@ -77,15 +47,15 @@ def test_get_num_of_features(): Message( "test a", data={ - SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1), - DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1), + SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1), + DENSE_FEATURE_NAMES[TEXT]: np.zeros(1), }, ), Message( "test b", data={ - SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1), - DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1), + SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1), + DENSE_FEATURE_NAMES[TEXT]: np.zeros(1), }, ), ], @@ -96,8 +66,8 @@ def test_get_num_of_features(): Message( "test a", data={ - SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]: np.zeros(1), - DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE]: np.zeros(1), + SPARSE_FEATURE_NAMES[INTENT]: np.zeros(1), + DENSE_FEATURE_NAMES[INTENT]: np.zeros(1), }, ) ], @@ -106,41 +76,50 @@ def test_get_num_of_features(): ], ) def test_check_labels_features_exist(messages, expected): - attribute = TEXT_ATTRIBUTE - - assert ( - EmbeddingIntentClassifier._check_labels_features_exist(messages, attribute) - == expected - ) + attribute = TEXT + assert DIETClassifier._check_labels_features_exist(messages, attribute) == expected -async def test_train(component_builder, tmpdir): - pipeline = [ - { - "name": "ConveRTTokenizer", - "intent_tokenization_flag": True, - "intent_split_symbol": "+", - }, - {"name": "CountVectorsFeaturizer"}, - {"name": "ConveRTFeaturizer"}, - {"name": "EmbeddingIntentClassifier"}, - ] +@pytest.mark.parametrize( + "pipeline", + [ + [ + { + "name": "ConveRTTokenizer", + "intent_tokenization_flag": True, + "intent_split_symbol": "+", + }, + {"name": "CountVectorsFeaturizer"}, + {"name": "ConveRTFeaturizer"}, + {"name": "DIETClassifier", MASKED_LM: True, EPOCHS: 1}, + ], + [ + {"name": "WhitespaceTokenizer"}, + {"name": "CountVectorsFeaturizer"}, + {"name": "DIETClassifier", LOSS_TYPE: "margin", EPOCHS: 1}, + ], + ], +) +async def test_train_persist_load_with_different_settings( + pipeline, component_builder, tmpdir +): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"}) - (trained, _, persisted_path) = await train( + (trainer, trained, persisted_path) = await train( _config, path=tmpdir.strpath, data="data/examples/rasa/demo-rasa-multi-intent.md", component_builder=component_builder, ) + assert trainer.pipeline assert trained.pipeline loaded = Interpreter.load(persisted_path, component_builder) + assert loaded.pipeline - assert loaded.parse("hello") is not None - assert loaded.parse("Hello today is Monday, again!") is not None + assert loaded.parse("Rasa is great!") == trained.parse("Rasa is great!") async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir): @@ -150,7 +129,7 @@ async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir): { "pipeline": [ {"name": "WhitespaceTokenizer"}, - {"name": "EmbeddingIntentClassifier"}, + {"name": "DIETClassifier", EPOCHS: 1}, ], "language": "en", } @@ -165,8 +144,8 @@ async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir): ) assert ( - "Failed to validate component 'EmbeddingIntentClassifier'. Missing one of " - "the following properties: " in str(e.value) + "'DIETClassifier' requires ['Featurizer']. " + "Add required components to the pipeline." 
in str(e.value) ) @@ -177,27 +156,32 @@ def as_pipeline(*components): @pytest.mark.parametrize( "classifier_params, data_path, output_length, output_should_sum_to_1", [ - ({"random_seed": 42}, "data/test/many_intents.md", 10, True), # default config ( - {"random_seed": 42, "ranking_length": 0}, + {RANDOM_SEED: 42, EPOCHS: 1}, + "data/test/many_intents.md", + 10, + True, + ), # default config + ( + {RANDOM_SEED: 42, RANKING_LENGTH: 0, EPOCHS: 1}, "data/test/many_intents.md", LABEL_RANKING_LENGTH, False, ), # no normalization ( - {"random_seed": 42, "ranking_length": 3}, + {RANDOM_SEED: 42, RANKING_LENGTH: 3, EPOCHS: 1}, "data/test/many_intents.md", 3, True, ), # lower than default ranking_length ( - {"random_seed": 42, "ranking_length": 12}, + {RANDOM_SEED: 42, RANKING_LENGTH: 12, EPOCHS: 1}, "data/test/many_intents.md", LABEL_RANKING_LENGTH, False, ), # higher than default ranking_length ( - {"random_seed": 42}, + {RANDOM_SEED: 42, EPOCHS: 1}, "examples/moodbot/data/nlu.md", 7, True, @@ -213,9 +197,9 @@ async def test_softmax_normalization( output_should_sum_to_1, ): pipeline = as_pipeline( - "WhitespaceTokenizer", "CountVectorsFeaturizer", "EmbeddingIntentClassifier" + "WhitespaceTokenizer", "CountVectorsFeaturizer", "DIETClassifier" ) - assert pipeline[2]["name"] == "EmbeddingIntentClassifier" + assert pipeline[2]["name"] == "DIETClassifier" pipeline[2].update(classifier_params) _config = RasaNLUModelConfig({"pipeline": pipeline}) @@ -244,15 +228,15 @@ async def test_softmax_normalization( @pytest.mark.parametrize( "classifier_params, output_length", - [({"loss_type": "margin", "random_seed": 42}, LABEL_RANKING_LENGTH)], + [({LOSS_TYPE: "margin", RANDOM_SEED: 42, EPOCHS: 1}, LABEL_RANKING_LENGTH)], ) async def test_margin_loss_is_not_normalized( monkeypatch, component_builder, tmpdir, classifier_params, output_length ): pipeline = as_pipeline( - "WhitespaceTokenizer", "CountVectorsFeaturizer", "EmbeddingIntentClassifier" + "WhitespaceTokenizer", "CountVectorsFeaturizer", "DIETClassifier" ) - assert pipeline[2]["name"] == "EmbeddingIntentClassifier" + assert pipeline[2]["name"] == "DIETClassifier" pipeline[2].update(classifier_params) mock = Mock() @@ -280,27 +264,39 @@ async def test_margin_loss_is_not_normalized( assert parse_data.get("intent") == intent_ranking[0] -@pytest.mark.parametrize( - "session_data, expected", - [ - ( - { - "text_features": [ - np.array( - [ - np.random.rand(5, 14), - np.random.rand(2, 14), - np.random.rand(3, 14), - ] - ) - ] - }, - True, - ), - ({"text_features": [np.array([0, 0, 0])]}, False), - ({"text_features": [scipy.sparse.csr_matrix([0, 0, 0])]}, False), - ({"text_features": [scipy.sparse.csr_matrix([0, 31, 0])]}, True), - ], -) -def test_text_features_present(session_data, expected): - assert EmbeddingIntentClassifier._text_features_present(session_data) == expected +async def test_set_random_seed(component_builder, tmpdir): + """test if train result is the same for two runs of tf embedding""" + + # set fixed random seed + _config = RasaNLUModelConfig( + { + "pipeline": [ + {"name": "WhitespaceTokenizer"}, + {"name": "CountVectorsFeaturizer"}, + {"name": "DIETClassifier", RANDOM_SEED: 1, EPOCHS: 1}, + ], + "language": "en", + } + ) + + # first run + (trained_a, _, persisted_path_a) = await train( + _config, + path=tmpdir.strpath + "_a", + data=DEFAULT_DATA_PATH, + component_builder=component_builder, + ) + # second run + (trained_b, _, persisted_path_b) = await train( + _config, + path=tmpdir.strpath + "_b", + data=DEFAULT_DATA_PATH, + 
component_builder=component_builder, + ) + + loaded_a = Interpreter.load(persisted_path_a, component_builder) + loaded_b = Interpreter.load(persisted_path_b, component_builder) + result_a = loaded_a.parse("hello")["intent"]["confidence"] + result_b = loaded_b.parse("hello")["intent"]["confidence"] + + assert result_a == result_b diff --git a/tests/nlu/base/test_classifiers.py b/tests/nlu/classifiers/test_keyword_classifier.py similarity index 97% rename from tests/nlu/base/test_classifiers.py rename to tests/nlu/classifiers/test_keyword_classifier.py index b0248f19879d..e5101c93f939 100644 --- a/tests/nlu/base/test_classifiers.py +++ b/tests/nlu/classifiers/test_keyword_classifier.py @@ -1,13 +1,10 @@ import pytest import copy -import logging from rasa.nlu.classifiers.keyword_intent_classifier import KeywordIntentClassifier # TODO: add tests for other classifers -# from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier # from rasa.nlu.classifiers.mitie_intent_classifier import MitieIntentClassifier -# from rasa.nlu.classifiers.sklearn_intent_classifier import SklearnIntentClassifier from rasa.nlu.training_data.formats.rasa import RasaReader from rasa.nlu.training_data import load_data from rasa.nlu.training_data.message import Message diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py index 21588dac0d2b..9c644930e5bd 100644 --- a/tests/nlu/conftest.py +++ b/tests/nlu/conftest.py @@ -1,21 +1,14 @@ -import logging -import os +from typing import Text import pytest -from rasa.nlu import config, train +from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.components import ComponentBuilder - -CONFIG_DEFAULTS_PATH = "sample_configs/config_defaults.yml" - -NLU_DEFAULT_CONFIG_PATH = "sample_configs/config_pretrained_embeddings_mitie.yml" +from rasa.utils.tensorflow.constants import EPOCHS, RANDOM_SEED +from tests.nlu.utilities import write_file_config DEFAULT_DATA_PATH = "data/examples/rasa/demo-rasa.json" -NLU_MODEL_NAME = "nlu_model.tar.gz" - -MOODBOT_MODEL_PATH = "examples/moodbot/models/" - @pytest.fixture(scope="session") def component_builder(): @@ -23,47 +16,93 @@ def component_builder(): @pytest.fixture(scope="session") -def spacy_nlp(component_builder, default_config): +def spacy_nlp(component_builder, blank_config): spacy_nlp_config = {"name": "SpacyNLP"} - return component_builder.create_component(spacy_nlp_config, default_config).nlp + return component_builder.create_component(spacy_nlp_config, blank_config).nlp @pytest.fixture(scope="session") -def spacy_nlp_component(component_builder, default_config): +def spacy_nlp_component(component_builder, blank_config): spacy_nlp_config = {"name": "SpacyNLP"} - return component_builder.create_component(spacy_nlp_config, default_config) + return component_builder.create_component(spacy_nlp_config, blank_config) @pytest.fixture(scope="session") -def ner_crf_pos_feature_config(): - return { - "features": [ - ["low", "title", "upper", "pos", "pos2"], - [ - "bias", - "low", - "suffix3", - "suffix2", - "upper", - "title", - "digit", - "pos", - "pos2", - "pattern", - ], - ["low", "title", "upper", "pos", "pos2"], - ] - } +def mitie_feature_extractor(component_builder: ComponentBuilder, blank_config): + mitie_nlp_config = {"name": "MitieNLP"} + return component_builder.create_component(mitie_nlp_config, blank_config).extractor @pytest.fixture(scope="session") -def mitie_feature_extractor(component_builder, default_config): - mitie_nlp_config = {"name": "MitieNLP"} - return 
component_builder.create_component( - mitie_nlp_config, default_config - ).extractor +def blank_config() -> RasaNLUModelConfig: + return RasaNLUModelConfig({"language": "en", "pipeline": []}) @pytest.fixture(scope="session") -def default_config(): - return config.load(CONFIG_DEFAULTS_PATH) +def config_path() -> Text: + return write_file_config( + { + "language": "en", + "pipeline": [ + {"name": "WhitespaceTokenizer"}, + {"name": "CRFEntityExtractor", EPOCHS: 1, RANDOM_SEED: 42}, + {"name": "CountVectorsFeaturizer"}, + {"name": "EmbeddingIntentClassifier", EPOCHS: 1, RANDOM_SEED: 42}, + ], + } + ).name + + +@pytest.fixture() +def pretrained_embeddings_spacy_config() -> RasaNLUModelConfig: + return RasaNLUModelConfig( + { + "language": "en", + "pipeline": [ + {"name": "SpacyNLP"}, + {"name": "SpacyTokenizer"}, + {"name": "SpacyFeaturizer"}, + {"name": "RegexFeaturizer"}, + {"name": "CRFEntityExtractor", EPOCHS: 1, RANDOM_SEED: 42}, + {"name": "EntitySynonymMapper"}, + {"name": "SklearnIntentClassifier"}, + ], + } + ) + + +@pytest.fixture() +def supervised_embeddings_config() -> RasaNLUModelConfig: + return RasaNLUModelConfig( + { + "language": "en", + "pipeline": [ + {"name": "WhitespaceTokenizer"}, + {"name": "RegexFeaturizer"}, + {"name": "CRFEntityExtractor", EPOCHS: 1, RANDOM_SEED: 42}, + {"name": "EntitySynonymMapper"}, + {"name": "CountVectorsFeaturizer"}, + { + "name": "CountVectorsFeaturizer", + "analyzer": "char_wb", + "min_ngram": 1, + "max_ngram": 4, + }, + {"name": "EmbeddingIntentClassifier", EPOCHS: 1, RANDOM_SEED: 42}, + ], + } + ) + + +@pytest.fixture() +def pretrained_embeddings_convert_config() -> RasaNLUModelConfig: + return RasaNLUModelConfig( + { + "language": "en", + "pipeline": [ + {"name": "ConveRTTokenizer"}, + {"name": "ConveRTFeaturizer"}, + {"name": "EmbeddingIntentClassifier", EPOCHS: 1, RANDOM_SEED: 42}, + ], + } + ) diff --git a/tests/nlu/emulators/__init__.py b/tests/nlu/emulators/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/nlu/emulators/test_dialogflow.py b/tests/nlu/emulators/test_dialogflow.py new file mode 100644 index 000000000000..76abc0ee0080 --- /dev/null +++ b/tests/nlu/emulators/test_dialogflow.py @@ -0,0 +1,40 @@ +def test_dialogflow_request(): + from rasa.nlu.emulators.dialogflow import DialogflowEmulator + + em = DialogflowEmulator() + norm = em.normalise_request_json({"text": ["arb text"]}) + assert norm == {"text": "arb text", "time": None} + + +def test_dialogflow_response(): + from rasa.nlu.emulators.dialogflow import DialogflowEmulator + + em = DialogflowEmulator() + data = { + "text": "I want italian food", + "intent": {"name": "inform", "confidence": 0.4794813722432127}, + "entities": [{"entity": "cuisine", "value": "italian", "start": 7, "end": 14}], + } + norm = em.normalise_response_json(data) + + assert norm == { + "id": norm["id"], + "result": { + "action": data["intent"]["name"], + "actionIncomplete": False, + "contexts": [], + "fulfillment": {}, + "metadata": { + "intentId": norm["result"]["metadata"]["intentId"], + "intentName": data["intent"]["name"], + "webhookUsed": "false", + }, + "parameters": {"cuisine": ["italian"]}, + "resolvedQuery": data["text"], + "score": data["intent"]["confidence"], + "source": "agent", + }, + "sessionId": norm["sessionId"], + "status": {"code": 200, "errorType": "success"}, + "timestamp": norm["timestamp"], + } diff --git a/tests/nlu/emulators/test_luis.py b/tests/nlu/emulators/test_luis.py new file mode 100644 index 000000000000..5c2cad97e1ba --- /dev/null +++ 
b/tests/nlu/emulators/test_luis.py @@ -0,0 +1,44 @@ +def test_luis_request(): + from rasa.nlu.emulators.luis import LUISEmulator + + em = LUISEmulator() + norm = em.normalise_request_json({"text": ["arb text"]}) + assert norm == {"text": "arb text", "time": None} + + +def test_luis_response(): + from rasa.nlu.emulators.luis import LUISEmulator + + em = LUISEmulator() + data = { + "text": "I want italian food", + "intent": {"name": "restaurant_search", "confidence": 0.737014589341683}, + "intent_ranking": [ + {"confidence": 0.737014589341683, "name": "restaurant_search"}, + {"confidence": 0.11605464483122209, "name": "goodbye"}, + {"confidence": 0.08816417744097163, "name": "greet"}, + {"confidence": 0.058766588386123204, "name": "affirm"}, + ], + "entities": [{"entity": "cuisine", "value": "italian"}], + } + norm = em.normalise_response_json(data) + assert norm == { + "query": data["text"], + "topScoringIntent": {"intent": "restaurant_search", "score": 0.737014589341683}, + "intents": [ + {"intent": "restaurant_search", "score": 0.737014589341683}, + {"intent": "goodbye", "score": 0.11605464483122209}, + {"intent": "greet", "score": 0.08816417744097163}, + {"intent": "affirm", "score": 0.058766588386123204}, + ], + "entities": [ + { + "entity": e["value"], + "type": e["entity"], + "startIndex": None, + "endIndex": None, + "score": None, + } + for e in data["entities"] + ], + } diff --git a/tests/nlu/emulators/test_no_emulator.py b/tests/nlu/emulators/test_no_emulator.py new file mode 100644 index 000000000000..cc40b3ae8390 --- /dev/null +++ b/tests/nlu/emulators/test_no_emulator.py @@ -0,0 +1,28 @@ +def test_dummy_request(): + from rasa.nlu.emulators.no_emulator import NoEmulator + + em = NoEmulator() + norm = em.normalise_request_json({"text": ["arb text"]}) + assert norm == {"text": "arb text", "time": None} + + norm = em.normalise_request_json({"text": ["arb text"], "time": "1499279161658"}) + assert norm == {"text": "arb text", "time": "1499279161658"} + + +def test_dummy_response(): + from rasa.nlu.emulators.no_emulator import NoEmulator + + em = NoEmulator() + data = {"intent": "greet", "text": "hi", "entities": {}, "confidence": 1.0} + assert em.normalise_response_json(data) == data + + +def test_emulators_can_handle_missing_data(): + from rasa.nlu.emulators.luis import LUISEmulator + + em = LUISEmulator() + norm = em.normalise_response_json( + {"text": "this data doesn't contain an intent result"} + ) + assert norm["topScoringIntent"] is None + assert norm["intents"] == [] diff --git a/tests/nlu/emulators/test_wit.py b/tests/nlu/emulators/test_wit.py new file mode 100644 index 000000000000..069caa26f27f --- /dev/null +++ b/tests/nlu/emulators/test_wit.py @@ -0,0 +1,34 @@ +def test_wit_request(): + from rasa.nlu.emulators.wit import WitEmulator + + em = WitEmulator() + norm = em.normalise_request_json({"text": ["arb text"]}) + assert norm == {"text": "arb text", "time": None} + + +def test_wit_response(): + from rasa.nlu.emulators.wit import WitEmulator + + em = WitEmulator() + data = { + "text": "I want italian food", + "intent": {"name": "inform", "confidence": 0.4794813722432127}, + "entities": [{"entity": "cuisine", "value": "italian", "start": 7, "end": 14}], + } + norm = em.normalise_response_json(data) + assert norm == [ + { + "entities": { + "cuisine": { + "confidence": None, + "type": "value", + "value": "italian", + "start": 7, + "end": 14, + } + }, + "intent": "inform", + "_text": "I want italian food", + "confidence": 0.4794813722432127, + } + ] diff --git 
a/tests/nlu/example_component.py b/tests/nlu/example_component.py index fea264ca9996..67a47e477b88 100644 --- a/tests/nlu/example_component.py +++ b/tests/nlu/example_component.py @@ -1,6 +1,9 @@ -from rasa.nlu.components import Component import typing -from typing import Any, Optional, Text, Dict +from typing import Any, Optional, Text, Dict, List, Type + +from rasa.nlu.components import Component +from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu.training_data import Message, TrainingData if typing.TYPE_CHECKING: from rasa.nlu.model import Metadata @@ -9,21 +12,13 @@ class MyComponent(Component): """A new component""" - # Defines what attributes the pipeline component will - # provide when called. The listed attributes - # should be set by the component on the message object - # during test and train, e.g. - # ```message.set("entities", [...])``` - provides = [] - - # Which attributes on a message are required by this - # component. E.g. if requires contains "tokens", than a - # previous component in the pipeline needs to have "tokens" - # within the above described `provides` property. - # Use `any_of("option_1", "option_2")` to define that either - # "option_1" or "option_2" needs to be present in the - # provided properties from the previous components. - requires = [] + # Which components are required by this component. + # Listed components should appear before the component itself in the pipeline. + @classmethod + def required_components(cls) -> List[Type[Component]]: + """Specify which components need to be present in the pipeline.""" + + return [] # Defines the default configuration parameters of a component # these values can be overwritten in the pipeline configuration @@ -37,10 +32,15 @@ class MyComponent(Component): # This is an important feature for backwards compatibility of components. language_list = None - def __init__(self, component_config=None): + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: super().__init__(component_config) - def train(self, training_data, cfg, **kwargs): + def train( + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, + ) -> None: """Train this component. This is the components chance to train itself provided @@ -53,7 +53,7 @@ def train(self, training_data, cfg, **kwargs): of components previous to this one.""" pass - def process(self, message, **kwargs): + def process(self, message: Message, **kwargs: Any) -> None: """Process an incoming message. 
This is the components chance to process an incoming diff --git a/tests/nlu/extractors/text_crf_entity_extractor.py b/tests/nlu/extractors/test_crf_entity_extractor.py similarity index 63% rename from tests/nlu/extractors/text_crf_entity_extractor.py rename to tests/nlu/extractors/test_crf_entity_extractor.py index 1ff19ba338de..b2cebd0ef42a 100644 --- a/tests/nlu/extractors/text_crf_entity_extractor.py +++ b/tests/nlu/extractors/test_crf_entity_extractor.py @@ -1,11 +1,12 @@ -from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.training_data import TrainingData, Message +from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer +from rasa.nlu.constants import TEXT, SPACY_DOCS, ENTITIES +from rasa.nlu.training_data import Message, TrainingData +from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor -def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config): - from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor - - ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) +def test_crf_extractor(spacy_nlp): examples = [ Message( "anywhere in the west", @@ -14,7 +15,7 @@ def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config): "entities": [ {"start": 16, "end": 20, "value": "west", "entity": "location"} ], - "spacy_doc": spacy_nlp("anywhere in the west"), + SPACY_DOCS[TEXT]: spacy_nlp("anywhere in the west"), }, ), Message( @@ -37,53 +38,69 @@ def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config): "extractor": "CRFEntityExtractor", }, ], - "spacy_doc": spacy_nlp("central indian restaurant"), + SPACY_DOCS[TEXT]: spacy_nlp("central indian restaurant"), }, ), ] - # uses BILOU and the default features - ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig()) - sentence = "anywhere in the west" - doc = {"spacy_doc": spacy_nlp(sentence)} - crf_format = ext._from_text_to_crf(Message(sentence, doc)) - assert [word[0] for word in crf_format] == ["anywhere", "in", "the", "west"] - feats = ext._sentence_to_features(crf_format) - assert "BOS" in feats[0] - assert "EOS" in feats[-1] - assert feats[1]["0:low"] == "in" - sentence = "anywhere in the west" - ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)})) - filtered = ext.filter_trainable_entities(examples) - assert filtered[0].get("entities") == [ - {"start": 16, "end": 20, "value": "west", "entity": "location"} - ], "Entity without extractor remains" - assert filtered[1].get("entities") == [ - { - "start": 8, - "end": 14, - "value": "indian", - "entity": "cuisine", - "extractor": "CRFEntityExtractor", + extractor = CRFEntityExtractor( + component_config={ + "features": [ + ["low", "title", "upper", "pos", "pos2"], + ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"], + ["low", "title", "upper", "pos", "pos2"], + ] } - ], "Only CRFEntityExtractor entity annotation remains" - assert examples[1].get("entities")[0] == { - "start": 0, - "end": 7, - "value": "central", - "entity": "location", - "extractor": "random_extractor", - }, "Original examples are not mutated" + ) + tokenizer = SpacyTokenizer() + training_data = TrainingData(training_examples=examples) + tokenizer.train(training_data) + extractor.train(training_data) -def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config): - from rasa.nlu.extractors.crf_entity_extractor import 
CRFEntityExtractor + sentence = "italian restaurant" + message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)}) + + tokenizer.process(message) + extractor.process(message) + + detected_entities = message.get(ENTITIES) + + assert len(detected_entities) == 1 + assert detected_entities[0]["entity"] == "cuisine" + assert detected_entities[0]["value"] == "italian" + + +def test_crf_json_from_BILOU(spacy_nlp): + ext = CRFEntityExtractor( + component_config={ + "features": [ + ["low", "title", "upper", "pos", "pos2"], + [ + "low", + "bias", + "suffix3", + "suffix2", + "upper", + "title", + "digit", + "pos", + "pos2", + ], + ["low", "title", "upper", "pos", "pos2"], + ] + } + ) - ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) sentence = "I need a home cleaning close-by" - doc = {"spacy_doc": spacy_nlp(sentence)} + + message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)}) + + tokenizer = SpacyTokenizer() + tokenizer.process(message) + r = ext._from_crf_to_json( - Message(sentence, doc), + message, [ {"O": 1.0}, {"O": 1.0}, @@ -106,15 +123,28 @@ def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config): assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"} -def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config): +def test_crf_json_from_non_BILOU(spacy_nlp): from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor - ner_crf_pos_feature_config.update({"BILOU_flag": False}) - ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) + ext = CRFEntityExtractor( + component_config={ + "BILOU_flag": False, + "features": [ + ["low", "title", "upper", "pos", "pos2"], + ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"], + ["low", "title", "upper", "pos", "pos2"], + ], + } + ) sentence = "I need a home cleaning close-by" - doc = {"spacy_doc": spacy_nlp(sentence)} + + message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)}) + + tokenizer = SpacyTokenizer() + tokenizer.process(message) + rs = ext._from_crf_to_json( - Message(sentence, doc), + message, [ {"O": 1.0}, {"O": 1.0}, @@ -142,10 +172,6 @@ def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config): def test_crf_create_entity_dict(spacy_nlp): - from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - crf_extractor = CRFEntityExtractor() spacy_tokenizer = SpacyTokenizer() white_space_tokenizer = WhitespaceTokenizer() @@ -172,6 +198,7 @@ def test_crf_create_entity_dict(spacy_nlp): }, } ], + SPACY_DOCS[TEXT]: spacy_nlp("where is St. 
Michael's Hospital?"), }, ) }, @@ -196,14 +223,15 @@ def test_crf_create_entity_dict(spacy_nlp): }, } ], + SPACY_DOCS[TEXT]: spacy_nlp("where is Children's Hospital?"), }, ) }, ] for ex in examples: # spacy tokenizers receives a Doc as input and whitespace tokenizer receives a text - spacy_tokens = spacy_tokenizer.tokenize(spacy_nlp(ex["message"].text)) - white_space_tokens = white_space_tokenizer.tokenize(ex["message"].text) + spacy_tokens = spacy_tokenizer.tokenize(ex["message"], TEXT) + white_space_tokens = white_space_tokenizer.tokenize(ex["message"], TEXT) for tokenizer, tokens in [ ("SpacyTokenizer", spacy_tokens), ("WhitespaceTokenizer", white_space_tokens), @@ -226,22 +254,35 @@ def test_crf_create_entity_dict(spacy_nlp): } -def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp): - from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor - from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer - - ner_crf_pos_feature_config["features"][1].append("text_dense_features") - crf_extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) +def test_crf_use_dense_features(spacy_nlp): + crf_extractor = CRFEntityExtractor( + component_config={ + "features": [ + ["low", "title", "upper", "pos", "pos2"], + [ + "low", + "suffix3", + "suffix2", + "upper", + "title", + "digit", + "pos", + "pos2", + "text_dense_features", + ], + ["low", "title", "upper", "pos", "pos2"], + ] + } + ) spacy_featurizer = SpacyFeaturizer() - white_space_tokenizer = WhitespaceTokenizer({"use_cls_token": False}) + spacy_tokenizer = SpacyTokenizer() text = "Rasa is a company in Berlin" message = Message(text) - message.set("spacy_doc", spacy_nlp(text)) + message.set(SPACY_DOCS[TEXT], spacy_nlp(text)) - white_space_tokenizer.process(message) + spacy_tokenizer.process(message) spacy_featurizer.process(message) text_data = crf_extractor._from_text_to_crf(message) diff --git a/tests/nlu/extractors/test_entity_synonyms.py b/tests/nlu/extractors/test_entity_synonyms.py index a6479b8f8b43..3d5230166b79 100644 --- a/tests/nlu/extractors/test_entity_synonyms.py +++ b/tests/nlu/extractors/test_entity_synonyms.py @@ -1,10 +1,32 @@ +from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper from rasa.nlu.training_data import TrainingData, Message -from tests.nlu import utilities -def test_unintentional_synonyms_capitalized(component_builder): - _config = utilities.base_test_conf("pretrained_embeddings_spacy") - ner_syn = component_builder.create_component(_config.for_component(5), _config) +def test_entity_synonyms(): + entities = [ + {"entity": "test", "value": "chines", "start": 0, "end": 6}, + {"entity": "test", "value": "chinese", "start": 0, "end": 6}, + {"entity": "test", "value": "china", "start": 0, "end": 6}, + ] + ent_synonyms = {"chines": "chinese", "NYC": "New York City"} + EntitySynonymMapper(synonyms=ent_synonyms).replace_synonyms(entities) + assert len(entities) == 3 + assert entities[0]["value"] == "chinese" + assert entities[1]["value"] == "chinese" + assert entities[2]["value"] == "china" + + +def test_unintentional_synonyms_capitalized( + component_builder, pretrained_embeddings_spacy_config +): + idx = pretrained_embeddings_spacy_config.component_names.index( + "EntitySynonymMapper" + ) + ner_syn = component_builder.create_component( + pretrained_embeddings_spacy_config.for_component(idx), + pretrained_embeddings_spacy_config, + ) + examples = [ Message( 
"Any Mexican restaurant will do", @@ -25,6 +47,10 @@ def test_unintentional_synonyms_capitalized(component_builder): }, ), ] - ner_syn.train(TrainingData(training_examples=examples), _config) + + ner_syn.train( + TrainingData(training_examples=examples), pretrained_embeddings_spacy_config + ) + assert ner_syn.synonyms.get("mexican") is None assert ner_syn.synonyms.get("tacos") == "Mexican" diff --git a/tests/nlu/extractors/test_spacy_entity_extractors.py b/tests/nlu/extractors/test_spacy_entity_extractors.py index 0c5e59ae5b7a..f417ae0f422d 100644 --- a/tests/nlu/extractors/test_spacy_entity_extractors.py +++ b/tests/nlu/extractors/test_spacy_entity_extractors.py @@ -10,7 +10,7 @@ def test_spacy_ner_extractor(component_builder, spacy_nlp): { "intent": "restaurant_search", "entities": [], - "spacy_doc": spacy_nlp("anywhere in the west"), + "text_spacy_doc": spacy_nlp("anywhere in the west"), }, ) @@ -33,7 +33,7 @@ def test_spacy_ner_extractor(component_builder, spacy_nlp): { "intent": "example_intent", "entities": [], - "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun"), + "text_spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun"), }, ) _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]}) diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py index a16bf6597156..c9dc0f47b3ba 100644 --- a/tests/nlu/featurizers/test_convert_featurizer.py +++ b/tests/nlu/featurizers/test_convert_featurizer.py @@ -5,11 +5,11 @@ from rasa.nlu.training_data import TrainingData from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.nlu.constants import ( - TEXT_ATTRIBUTE, + TEXT, DENSE_FEATURE_NAMES, TOKENS_NAMES, - RESPONSE_ATTRIBUTE, - INTENT_ATTRIBUTE, + RESPONSE, + INTENT, ) from rasa.nlu.training_data import Message from rasa.nlu.config import RasaNLUModelConfig @@ -21,9 +21,9 @@ def test_convert_featurizer_process(): sentence = "Hey how are you today ?" message = Message(sentence) - tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT_ATTRIBUTE) - tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT_ATTRIBUTE) - message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens) + tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT) + tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT) + message.set(TOKENS_NAMES[TEXT], tokens) featurizer.process(message) @@ -32,7 +32,7 @@ def test_convert_featurizer_process(): [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353] ) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) assert len(tokens) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) @@ -44,11 +44,11 @@ def test_convert_featurizer_train(): sentence = "Hey how are you today ?" 
message = Message(sentence) - message.set(RESPONSE_ATTRIBUTE, sentence) - tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT_ATTRIBUTE) - tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT_ATTRIBUTE) - message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens) - message.set(TOKENS_NAMES[RESPONSE_ATTRIBUTE], tokens) + message.set(RESPONSE, sentence) + tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT) + tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT) + message.set(TOKENS_NAMES[TEXT], tokens) + message.set(TOKENS_NAMES[RESPONSE], tokens) featurizer.train(TrainingData([message]), RasaNLUModelConfig()) @@ -57,19 +57,19 @@ def test_convert_featurizer_train(): [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353] ) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) assert len(tokens) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) assert len(tokens) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) assert vecs is None @@ -85,7 +85,7 @@ def test_convert_featurizer_train(): ], ) def test_convert_featurizer_tokens_to_text(sentence, expected_text): - tokens = ConveRTTokenizer().tokenize(Message(sentence), attribute=TEXT_ATTRIBUTE) + tokens = ConveRTTokenizer().tokenize(Message(sentence), attribute=TEXT) actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0] diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index b65c29f7cd92..a2b9d6c47a4d 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -7,10 +7,10 @@ from rasa.nlu.constants import ( CLS_TOKEN, TOKENS_NAMES, - TEXT_ATTRIBUTE, - INTENT_ATTRIBUTE, + TEXT, + INTENT, SPARSE_FEATURE_NAMES, - RESPONSE_ATTRIBUTE, + RESPONSE, ) from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message @@ -43,10 +43,10 @@ def test_count_vector_featurizer(sentence, expected, expected_cls): ftr.process(test_message) assert isinstance( - test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]), scipy.sparse.coo_matrix + test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix ) - actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray() + actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() assert np.all(actual[0] == expected) assert np.all(actual[-1] == expected_cls) @@ -64,14 +64,14 @@ def test_count_vector_featurizer_response_attribute_featurization( train_message = Message(sentence) # this is needed for a valid training example - train_message.set(INTENT_ATTRIBUTE, intent) - train_message.set(RESPONSE_ATTRIBUTE, response) + train_message.set(INTENT, intent) + train_message.set(RESPONSE, response) # add a second example that has some response, so that the vocabulary for # response exists second_message = Message("hello") - second_message.set(RESPONSE_ATTRIBUTE, "hi") - second_message.set(INTENT_ATTRIBUTE, "greet") + second_message.set(RESPONSE, "hi") + second_message.set(INTENT, "greet") data = TrainingData([train_message, second_message]) @@ -80,19 +80,19 @@ def 
test_count_vector_featurizer_response_attribute_featurization( if intent_features: assert ( - train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0] + train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features ) else: - assert train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None + assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None if response_features: assert ( - train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0] + train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] == response_features ) else: - assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) is None + assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None @pytest.mark.parametrize( @@ -111,8 +111,8 @@ def test_count_vector_featurizer_attribute_featurization( train_message = Message(sentence) # this is needed for a valid training example - train_message.set(INTENT_ATTRIBUTE, intent) - train_message.set(RESPONSE_ATTRIBUTE, response) + train_message.set(INTENT, intent) + train_message.set(RESPONSE, response) data = TrainingData([train_message]) @@ -121,19 +121,19 @@ def test_count_vector_featurizer_attribute_featurization( if intent_features: assert ( - train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0] + train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features ) else: - assert train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None + assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None if response_features: assert ( - train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0] + train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] == response_features ) else: - assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) is None + assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None @pytest.mark.parametrize( @@ -160,23 +160,21 @@ def test_count_vector_featurizer_shared_vocab( train_message = Message(sentence) # this is needed for a valid training example - train_message.set(INTENT_ATTRIBUTE, intent) - train_message.set(RESPONSE_ATTRIBUTE, response) + train_message.set(INTENT, intent) + train_message.set(RESPONSE, response) data = TrainingData([train_message]) tk.train(data) ftr.train(data) assert np.all( - train_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] - == text_features + train_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == text_features ) assert np.all( - train_message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0] - == intent_features + train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features ) assert np.all( - train_message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0] + train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] == response_features ) @@ -203,9 +201,7 @@ def test_count_vector_featurizer_oov_token(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert np.all( - test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] == expected - ) + assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected) @pytest.mark.parametrize( @@ -235,9 +231,7 @@ def test_count_vector_featurizer_oov_words(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert np.all( - test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] == expected - ) + assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected) 
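# Editor's note (not part of the patch): after the rename, sparse features are still read off
# the message under SPARSE_FEATURE_NAMES[TEXT] as a scipy sparse matrix. A minimal sketch of
# the train/process cycle these tests exercise, assuming CountVectorsFeaturizer's default
# settings and its import path under rasa.nlu.featurizers.sparse_featurizer (neither is shown
# in this hunk).
import scipy.sparse

from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES
from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
    CountVectorsFeaturizer,
)
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.training_data import Message, TrainingData

train_message = Message("hello how are you")
test_message = Message("hello hello")

tokenizer = WhitespaceTokenizer()
featurizer = CountVectorsFeaturizer()

# train() builds the vocabulary from the tokenized training data;
# process() then sets the bag-of-words matrix on a new message.
data = TrainingData([train_message])
tokenizer.train(data)
featurizer.train(data)

tokenizer.process(test_message)
featurizer.process(test_message)

# One row per token; the final row is the CLS aggregate in this setup.
vecs = test_message.get(SPARSE_FEATURE_NAMES[TEXT])
assert isinstance(vecs, scipy.sparse.coo_matrix)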
@pytest.mark.parametrize( @@ -263,20 +257,18 @@ def test_count_vector_featurizer_using_tokens(tokens, expected): tokens_feature = [Token(i, 0) for i in tokens] train_message = Message("") - train_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature) + train_message.set(TOKENS_NAMES[TEXT], tokens_feature) data = TrainingData([train_message]) ftr.train(data) test_message = Message("") - test_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature) + test_message.set(TOKENS_NAMES[TEXT], tokens_feature) ftr.process(test_message) - assert np.all( - test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] == expected - ) + assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected) @pytest.mark.parametrize( @@ -300,9 +292,7 @@ def test_count_vector_featurizer_char(sentence, expected): WhitespaceTokenizer().process(test_message) ftr.process(test_message) - assert np.all( - test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] == expected - ) + assert np.all(test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] == expected) def test_count_vector_featurizer_persist_load(tmpdir): @@ -363,10 +353,10 @@ def test_count_vector_featurizer_persist_load(tmpdir): # check that train features and test features after loading are the same assert np.all( [ - train_message1.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray() - == test_message1.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray(), - train_message2.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray() - == test_message2.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray(), + train_message1.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + == test_message1.get(SPARSE_FEATURE_NAMES[TEXT]).toarray(), + train_message2.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + == test_message2.get(SPARSE_FEATURE_NAMES[TEXT]).toarray(), ] ) @@ -377,8 +367,8 @@ def test_count_vectors_featurizer_train(): sentence = "Hey how are you today ?" 
message = Message(sentence) - message.set(RESPONSE_ATTRIBUTE, sentence) - message.set(INTENT_ATTRIBUTE, "intent") + message.set(RESPONSE, sentence) + message.set(INTENT, "intent") WhitespaceTokenizer().train(TrainingData([message])) featurizer.train(TrainingData([message]), RasaNLUModelConfig()) @@ -386,19 +376,19 @@ def test_count_vectors_featurizer_train(): expected = np.array([0, 1, 0, 0, 0]) expected_cls = np.array([1, 1, 1, 1, 1]) - vecs = message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) + vecs = message.get(SPARSE_FEATURE_NAMES[TEXT]) assert (6, 5) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) + vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE]) assert (6, 5) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) + vecs = message.get(SPARSE_FEATURE_NAMES[INTENT]) assert (1, 1) == vecs.shape assert np.all(vecs.toarray()[0] == np.array([1])) diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 995f458c14fa..7561f603eebf 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -2,15 +2,19 @@ import pytest import scipy.sparse -from rasa.nlu.featurizers.featurizer import Featurizer, sequence_to_sentence_features -from rasa.nlu.constants import DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES, TEXT_ATTRIBUTE +from rasa.nlu.featurizers.featurizer import ( + SparseFeaturizer, + DenseFeaturizer, + sequence_to_sentence_features, +) +from rasa.nlu.constants import DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES, TEXT from rasa.nlu.training_data import Message def test_combine_with_existing_dense_features(): - featurizer = Featurizer() - attribute = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE] + featurizer = DenseFeaturizer() + attribute = DENSE_FEATURE_NAMES[TEXT] existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] new_features = [[1, 0], [0, 1]] @@ -27,8 +31,8 @@ def test_combine_with_existing_dense_features(): def test_combine_with_existing_dense_features_shape_mismatch(): - featurizer = Featurizer() - attribute = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE] + featurizer = DenseFeaturizer() + attribute = DENSE_FEATURE_NAMES[TEXT] existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] new_features = [[0, 1]] @@ -43,9 +47,8 @@ def test_combine_with_existing_dense_features_shape_mismatch(): def test_combine_with_existing_sparse_features(): - - featurizer = Featurizer() - attribute = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE] + featurizer = SparseFeaturizer() + attribute = SPARSE_FEATURE_NAMES[TEXT] existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) new_features = scipy.sparse.csr_matrix([[1, 0], [0, 1]]) @@ -63,9 +66,8 @@ def test_combine_with_existing_sparse_features(): def test_combine_with_existing_sparse_features_shape_mismatch(): - - featurizer = Featurizer() - attribute = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE] + featurizer = SparseFeaturizer() + attribute = SPARSE_FEATURE_NAMES[TEXT] existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) new_features = scipy.sparse.csr_matrix([[0, 1]]) @@ -101,3 +103,29 @@ def test_sequence_to_sentence_features(features, expected): assert np.all(expected.toarray() == actual.toarray()) else: assert np.all(expected == actual) + + +@pytest.mark.parametrize( + "pooling, features, expected", + [ + ( + "mean", + np.array([[0.5, 3, 0.4, 
0.1], [0, 0, 0, 0], [0.5, 3, 0.4, 0.1]]), + np.array([[0.5, 3, 0.4, 0.1]]), + ), + ( + "max", + np.array([[1.0, 3.0, 0.0, 2.0], [4.0, 3.0, 1.0, 0.0]]), + np.array([[4.0, 3.0, 1.0, 2.0]]), + ), + ( + "max", + np.array([[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]), + np.array([[0.0, 0.0, 0.0, 0.0]]), + ), + ], +) +def test_calculate_cls_vector(pooling, features, expected): + actual = DenseFeaturizer._calculate_cls_vector(features, pooling) + + assert np.all(actual == expected) diff --git a/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py new file mode 100644 index 000000000000..675b14bbda63 --- /dev/null +++ b/tests/nlu/featurizers/test_lexical_syntactic_featurizer.py @@ -0,0 +1,140 @@ +import numpy as np +import pytest + +import scipy.sparse + +from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.nlu.featurizers.sparse_featurizer.lexical_syntactic_featurizer import ( + LexicalSyntacticFeaturizer, +) +from rasa.nlu.training_data import TrainingData +from rasa.nlu.constants import TEXT, SPARSE_FEATURE_NAMES, SPACY_DOCS +from rasa.nlu.training_data import Message + + +@pytest.mark.parametrize( + "sentence, expected_features", + [ + ( + "hello goodbye hello", + [ + [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0], + [0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0], + [1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0, 3.0, 1.0, 2.0, 1.0, 1.0, 2.0], + ], + ), + ( + "a 1", + [ + [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0], + [1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ], + ), + ], +) +def test_text_featurizer(sentence, expected_features): + featurizer = LexicalSyntacticFeaturizer( + { + "features": [ + ["BOS", "upper"], + ["BOS", "EOS", "prefix2", "digit"], + ["EOS", "low"], + ] + } + ) + + train_message = Message(sentence) + test_message = Message(sentence) + + WhitespaceTokenizer().process(train_message) + WhitespaceTokenizer().process(test_message) + + featurizer.train(TrainingData([train_message])) + + featurizer.process(test_message) + + assert isinstance( + test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix + ) + + actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + + assert np.all(actual == expected_features) + + +@pytest.mark.parametrize( + "sentence, expected, expected_cls", + [ + ( + "hello 123 hello 123 hello", + [[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]], + [[2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0]], + ) + ], +) +def test_text_featurizer_window_size(sentence, expected, expected_cls): + featurizer = LexicalSyntacticFeaturizer( + {"features": [["upper"], ["digit"], ["low"], ["digit"]]} + ) + + train_message = Message(sentence) + test_message = Message(sentence) + + WhitespaceTokenizer().process(train_message) + WhitespaceTokenizer().process(test_message) + + featurizer.train(TrainingData([train_message])) + + featurizer.process(test_message) + + assert isinstance( + test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix + ) + + actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + + assert np.all(actual[0] == expected) + assert np.all(actual[-1] == expected_cls) + + +@pytest.mark.parametrize( + "sentence, expected", + [ + ( + "The sun is shining", + [ + [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0], 
+ [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0], + [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0], + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0], + ], + ) + ], +) +def test_text_featurizer_using_pos(sentence, expected, spacy_nlp): + featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]}) + + train_message = Message(sentence) + test_message = Message(sentence) + + train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) + test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) + + SpacyTokenizer().process(train_message) + SpacyTokenizer().process(test_message) + + featurizer.train(TrainingData([train_message])) + + featurizer.process(test_message) + + assert isinstance( + test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix + ) + + actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() + + assert np.all(actual == expected) diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py new file mode 100644 index 000000000000..f490781ad714 --- /dev/null +++ b/tests/nlu/featurizers/test_lm_featurizer.py @@ -0,0 +1,224 @@ +import numpy as np +import pytest + +from rasa.nlu.training_data import TrainingData +from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer +from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP +from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES, INTENT +from rasa.nlu.training_data import Message + + +@pytest.mark.parametrize( + "model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec", + [ + ( + "bert", + ["Good evening.", "here is the sentence I want embeddings for."], + [(3, 768), (12, 768)], + [ + [0.5727445, -0.16078179], + [-0.5485125, 0.09632876, -0.4278888, 0.11438395, 0.18316492], + ], + [ + [0.068804, 0.32802248, -0.11250398, -0.11338018, -0.37116352], + [0.05909364, 0.06433402, 0.08569086, -0.16530034, -0.11396906], + ], + ), + ( + "gpt", + ["Good evening.", "here is the sentence I want embeddings for."], + [(3, 768), (10, 768)], + [ + [-0.0630323737859726, 0.4029877185821533], + [ + 0.8072432279586792, + -0.08990508317947388, + 0.9985930919647217, + -0.38779014348983765, + 0.08921952545642853, + ], + ], + [ + [ + 0.16997766494750977, + 0.1493849903345108, + 0.39421725273132324, + -0.5753618478775024, + 0.05096133053302765, + ], + [ + 0.41056010127067566, + -0.1169343888759613, + -0.3019704818725586, + -0.40207183361053467, + 0.6289798021316528, + ], + ], + ), + ( + "gpt2", + ["Good evening.", "here is the sentence I want embeddings for."], + [(4, 768), (12, 768)], + [ + [-0.033827826380729675, -0.10971662402153015, 0.002244209870696068], + [ + -0.18434514105319977, + -0.5386468768119812, + -0.11122681945562363, + -1.368929147720337, + -0.5397579669952393, + ], + ], + [ + [ + -0.04710008203983307, + -0.2793063223361969, + -0.23804056644439697, + -0.3212292492389679, + 0.11430201679468155, + ], + [ + -0.1809544414281845, + -0.017152192071080208, + -0.3176477551460266, + -0.008387327194213867, + 0.3365338146686554, + ], + ], + ), + ( + "xlnet", + ["Good evening.", "here is the sentence I want embeddings for."], + [(3, 768), (11, 768)], + [ + [1.7612367868423462, 2.5819129943847656], + [ + 0.784195065498352, + 0.7068007588386536, + 1.5883606672286987, + 1.891886591911316, + 2.5209126472473145, + ], + ], + [ + [ + 2.171574831008911, + -1.5377449989318848, + -3.2671749591827393, + 0.22520869970321655, + -1.598855972290039, + ], + [ + 1.6516317129135132, + 0.021670114248991013, + -2.5114030838012695, 
+ 1.447351098060608, + -2.5866634845733643, + ], + ], + ), + ( + "distilbert", + ["Good evening.", "here is the sentence I want embeddings for."], + [(3, 768), (12, 768)], + [ + [0.22866562008857727, -0.0575055330991745], + [ + -0.6448041796684265, + -0.5105321407318115, + -0.4892978072166443, + 0.17531153559684753, + 0.22717803716659546, + ], + ], + [ + [ + -0.09814466536045074, + -0.07325993478298187, + 0.22358475625514984, + -0.20274735987186432, + -0.07363069802522659, + ], + [ + -0.146609365940094, + -0.07373693585395813, + 0.016850866377353668, + -0.2407529354095459, + -0.0979844480752945, + ], + ], + ), + ( + "roberta", + ["Good evening.", "here is the sentence I want embeddings for."], + [(4, 768), (12, 768)], + [ + [-0.309267520904541, 0.12365783751010895, 0.06769893318414688], + [ + 0.02152823843061924, + -0.08026768267154694, + -0.10808645188808441, + 0.20090824365615845, + 0.04756045714020729, + ], + ], + [ + [ + -0.03930358216166496, + 0.034788478165864944, + 0.12246038764715195, + 0.08401528000831604, + 0.7026961445808411, + ], + [ + -0.018586941063404083, + -0.09835464507341385, + 0.03242188319563866, + 0.09366855770349503, + 0.4458026587963104, + ], + ], + ), + ], +) +def test_lm_featurizer_shape_values( + model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec +): + transformers_config = {"model_name": model_name} + + transformers_nlp = HFTransformersNLP(transformers_config) + lm_featurizer = LanguageModelFeaturizer() + + messages = [] + for text in texts: + messages.append(Message.build(text=text)) + td = TrainingData(messages) + + transformers_nlp.train(td) + lm_featurizer.train(td) + + for index in range(len(texts)): + + computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT]) + computed_sequence_vec, computed_sentence_vec = ( + computed_feature_vec[:-1], + computed_feature_vec[-1], + ) + + assert computed_feature_vec.shape == expected_shape[index] + + # Look at the value of first dimension for a few starting timesteps + assert np.allclose( + computed_sequence_vec[: len(expected_sequence_vec[index]), 0], + expected_sequence_vec[index], + atol=1e-5, + ) + + # Look at the first value of first five dimensions + assert np.allclose( + computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5 + ) + + intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT]) + + assert intent_vec is None diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index 95fb1f7a83b0..6a2f1757a2f5 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -2,9 +2,9 @@ from rasa.nlu.constants import ( DENSE_FEATURE_NAMES, - TEXT_ATTRIBUTE, - RESPONSE_ATTRIBUTE, - INTENT_ATTRIBUTE, + TEXT, + RESPONSE, + INTENT, TOKENS_NAMES, ) from rasa.nlu.training_data import Message, TrainingData @@ -20,7 +20,7 @@ def test_mitie_featurizer(mitie_feature_extractor): sentence = "Hey how are you today" message = Message(sentence) MitieTokenizer().process(message) - tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) + tokens = message.get(TOKENS_NAMES[TEXT]) vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor) @@ -40,8 +40,8 @@ def test_mitie_featurizer_train(mitie_feature_extractor): sentence = "Hey how are you today" message = Message(sentence) - message.set(RESPONSE_ATTRIBUTE, sentence) - message.set(INTENT_ATTRIBUTE, "intent") + message.set(RESPONSE, sentence) + message.set(INTENT, "intent") MitieTokenizer().train(TrainingData([message])) 
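# Editor's note (not part of the patch): test_lm_featurizer.py above exercises the new
# HuggingFace integration end to end. A condensed sketch of that flow, taken from the test
# above with "bert" as the example model_name; running it downloads the pre-trained weights.
from rasa.nlu.constants import TEXT, DENSE_FEATURE_NAMES
from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer
from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
from rasa.nlu.training_data import Message, TrainingData

# HFTransformersNLP runs the pre-trained transformer; LanguageModelFeaturizer
# copies the resulting dense vectors onto each message.
transformers_nlp = HFTransformersNLP({"model_name": "bert"})
lm_featurizer = LanguageModelFeaturizer()

lm_messages = [
    Message.build(text="Good evening."),
    Message.build(text="here is the sentence I want embeddings for."),
]
lm_data = TrainingData(lm_messages)

transformers_nlp.train(lm_data)
lm_featurizer.train(lm_data)

for lm_message in lm_messages:
    feature_vec = lm_message.get(DENSE_FEATURE_NAMES[TEXT])
    # The last row is the sentence-level (CLS) vector; the rest are per-token vectors.
    sequence_vec, cls_vec = feature_vec[:-1], feature_vec[-1]
    print(feature_vec.shape)  # e.g. (3, 768) for "Good evening." with BERT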
featurizer.train( @@ -55,18 +55,18 @@ def test_mitie_featurizer_train(mitie_feature_extractor): ) expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) - assert len(message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])) == len(vecs) + assert len(message.get(TOKENS_NAMES[TEXT])) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) - assert len(message.get(TOKENS_NAMES[RESPONSE_ATTRIBUTE])) == len(vecs) + assert len(message.get(TOKENS_NAMES[RESPONSE])) == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) assert vecs is None diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index dcc9b80e107d..6a887ea2793e 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -6,11 +6,11 @@ from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer from rasa.nlu.constants import ( - TEXT_ATTRIBUTE, - RESPONSE_ATTRIBUTE, + TEXT, + RESPONSE, SPACY_DOCS, TOKENS_NAMES, - INTENT_ATTRIBUTE, + INTENT, SPARSE_FEATURE_NAMES, ) from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer @@ -80,17 +80,17 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): # adds tokens to the message tokenizer = SpacyTokenizer({}) - message = Message(sentence, data={RESPONSE_ATTRIBUTE: sentence}) - message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence)) + message = Message(sentence, data={RESPONSE: sentence}) + message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) tokenizer.process(message) - result = ftr._features_for_patterns(message, TEXT_ATTRIBUTE) + result = ftr._features_for_patterns(message, TEXT) assert np.allclose(result.toarray(), expected, atol=1e-10) # the tokenizer should have added tokens - assert len(message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], [])) > 0 + assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0 # the number of regex matches on each token should match - for i, token in enumerate(message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])): + for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])): token_matches = token.get("pattern").values() num_matches = sum(token_matches) assert num_matches == labeled_tokens.count(i) @@ -141,16 +141,16 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): component_config = {"name": "SpacyTokenizer"} tokenizer = SpacyTokenizer(component_config) message = Message(sentence) - message.set("spacy_doc", spacy_nlp(sentence)) + message.set("text_spacy_doc", spacy_nlp(sentence)) tokenizer.process(message) - result = ftr._features_for_patterns(message, TEXT_ATTRIBUTE) + result = ftr._features_for_patterns(message, TEXT) assert np.allclose(result.toarray(), expected, atol=1e-10) # the tokenizer should have added tokens - assert len(message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], [])) > 0 + assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0 # the number of regex matches on each token should match - for i, token in enumerate(message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])): 
+ for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])): token_matches = token.get("pattern").values() num_matches = sum(token_matches) assert num_matches == labeled_tokens.count(i) @@ -177,10 +177,10 @@ def test_regex_featurizer_no_sequence(sentence, expected, expected_cls, spacy_nl # adds tokens to the message tokenizer = SpacyTokenizer() message = Message(sentence) - message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence)) + message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) tokenizer.process(message) - result = ftr._features_for_patterns(message, TEXT_ATTRIBUTE) + result = ftr._features_for_patterns(message, TEXT) assert np.allclose(result.toarray()[0], expected, atol=1e-10) assert np.allclose(result.toarray()[-1], expected_cls, atol=1e-10) @@ -197,8 +197,8 @@ def test_regex_featurizer_train(): sentence = "hey how are you today 19.12.2019 ?" message = Message(sentence) - message.set(RESPONSE_ATTRIBUTE, sentence) - message.set(INTENT_ATTRIBUTE, "intent") + message.set(RESPONSE, sentence) + message.set(INTENT, "intent") WhitespaceTokenizer().train(TrainingData([message])) featurizer.train( @@ -208,18 +208,18 @@ def test_regex_featurizer_train(): expected = np.array([0, 1, 0]) expected_cls = np.array([1, 1, 1]) - vecs = message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) + vecs = message.get(SPARSE_FEATURE_NAMES[TEXT]) assert (7, 3) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) + vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE]) assert (7, 3) == vecs.shape assert np.all(vecs.toarray()[0] == expected) assert np.all(vecs.toarray()[-1] == expected_cls) - vecs = message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) + vecs = message.get(SPARSE_FEATURE_NAMES[INTENT]) assert vecs is None diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py index e13acd4a0312..6d60b171b906 100644 --- a/tests/nlu/featurizers/test_spacy_featurizer.py +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -6,13 +6,7 @@ from rasa.nlu.training_data import TrainingData from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer -from rasa.nlu.constants import ( - SPACY_DOCS, - TEXT_ATTRIBUTE, - DENSE_FEATURE_NAMES, - RESPONSE_ATTRIBUTE, - INTENT_ATTRIBUTE, -) +from rasa.nlu.constants import SPACY_DOCS, TEXT, DENSE_FEATURE_NAMES, RESPONSE, INTENT def test_spacy_featurizer_cls_vector(spacy_nlp): @@ -20,11 +14,11 @@ def test_spacy_featurizer_cls_vector(spacy_nlp): sentence = "Hey how are you today" message = Message(sentence) - message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence)) + message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) featurizer._set_spacy_features(message) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322]) expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]) @@ -105,7 +99,7 @@ def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp): greet = {"intent": "greet", "text_features": [0.5]} message = Message(sentence, greet) - message.set("spacy_doc", doc) + message.set("text_spacy_doc", doc) ftr._set_spacy_features(message) @@ -146,28 +140,28 @@ def test_spacy_featurizer_train(spacy_nlp): sentence = "Hey how are you today" message = Message(sentence) - message.set(RESPONSE_ATTRIBUTE, 
sentence) - message.set(INTENT_ATTRIBUTE, "intent") - message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence)) - message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(sentence)) + message.set(RESPONSE, sentence) + message.set(INTENT, "intent") + message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) + message.set(SPACY_DOCS[RESPONSE], spacy_nlp(sentence)) featurizer.train(TrainingData([message]), RasaNLUModelConfig()) expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322]) expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]) - vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[TEXT]) assert 6 == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE]) assert 6 == len(vecs) assert np.allclose(vecs[0][:5], expected, atol=1e-5) assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5) - vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) + vecs = message.get(DENSE_FEATURE_NAMES[INTENT]) assert vecs is None diff --git a/tests/nlu/selectors/test_embedding_response_selector.py b/tests/nlu/selectors/test_selectors.py similarity index 51% rename from tests/nlu/selectors/test_embedding_response_selector.py rename to tests/nlu/selectors/test_selectors.py index 6b040ec8b74e..fbce352a6b8b 100644 --- a/tests/nlu/selectors/test_embedding_response_selector.py +++ b/tests/nlu/selectors/test_selectors.py @@ -1,17 +1,29 @@ +import pytest + +from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import load_data -from rasa.nlu import config from rasa.nlu.train import Trainer, Interpreter +from rasa.utils.tensorflow.constants import EPOCHS -def test_train_response_selector(component_builder, tmpdir): +@pytest.mark.parametrize( + "pipeline", + [ + [ + {"name": "WhitespaceTokenizer"}, + {"name": "CountVectorsFeaturizer"}, + {"name": "ResponseSelector", EPOCHS: 1}, + ] + ], +) +def test_train_selector(pipeline, component_builder, tmpdir): + # use data that include some responses td = load_data("data/examples/rasa/demo-rasa.md") td_responses = load_data("data/examples/rasa/demo-rasa-responses.md") td = td.merge(td_responses) td.fill_response_phrases() - nlu_config = config.load( - "sample_configs/config_embedding_intent_response_selector.yml" - ) + nlu_config = RasaNLUModelConfig({"language": "en", "pipeline": pipeline}) trainer = Trainer(nlu_config) trainer.train(td) @@ -19,7 +31,8 @@ def test_train_response_selector(component_builder, tmpdir): persisted_path = trainer.persist(tmpdir) assert trainer.pipeline + loaded = Interpreter.load(persisted_path, component_builder) + assert loaded.pipeline assert loaded.parse("hello") is not None - assert loaded.parse("Hello today is Monday, again!") is not None diff --git a/tests/nlu/base/test_components.py b/tests/nlu/test_components.py similarity index 74% rename from tests/nlu/base/test_components.py rename to tests/nlu/test_components.py index 44845d7460a3..6f3e35027081 100644 --- a/tests/nlu/base/test_components.py +++ b/tests/nlu/test_components.py @@ -33,27 +33,24 @@ def test_all_components_in_model_templates_exist(pipeline_template): @pytest.mark.parametrize("component_class", registry.component_classes) -def test_all_arguments_can_be_satisfied(component_class): - """Check that `train` method parameters can be filled - filled from the context. 
Similar to `pipeline_init` test.""" - - # All available context arguments that will ever be generated during train - # it might still happen, that in a certain pipeline - # configuration arguments can not be satisfied! - provided_properties = { - provided for c in registry.component_classes for provided in c.provides - } - - for req in component_class.requires: - if isinstance(req, Tuple): - for r in req: - assert ( - r in provided_properties - ), "No component provides required property." - else: - assert ( - req in provided_properties - ), "No component provides required property." +def test_all_required_components_can_be_satisfied(component_class): + """Checks that all required_components are present in the registry.""" + + def _required_component_in_registry(component): + for previous_component in registry.component_classes: + if issubclass(previous_component, component): + return True + return False + + missing_components = [] + for required_component in component_class.required_components(): + if not _required_component_in_registry(required_component): + missing_components.append(required_component.name) + + assert missing_components == [], ( + f"There is no required components {missing_components} " + f"for '{component_class.name}'." + ) def test_find_unavailable_packages(): @@ -63,12 +60,12 @@ def test_find_unavailable_packages(): assert unavailable == {"my_made_up_package_name", "foo_bar"} -def test_builder_create_by_module_path(component_builder, default_config): +def test_builder_create_by_module_path(component_builder, blank_config): from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer path = "rasa.nlu.featurizers.sparse_featurizer.regex_featurizer.RegexFeaturizer" component_config = {"name": path} - component = component_builder.create_component(component_config, default_config) + component = component_builder.create_component(component_config, blank_config) assert type(component) == RegexFeaturizer @@ -85,12 +82,12 @@ def test_builder_create_by_module_path(component_builder, default_config): ], ) def test_create_component_exception_messages( - component_builder, default_config, test_input, expected_output, error + component_builder, blank_config, test_input, expected_output, error ): with pytest.raises(error): component_config = {"name": test_input} - component_builder.create_component(component_config, default_config) + component_builder.create_component(component_config, blank_config) def test_builder_load_unknown(component_builder): diff --git a/tests/nlu/test_config.py b/tests/nlu/test_config.py new file mode 100644 index 000000000000..f1061b12846b --- /dev/null +++ b/tests/nlu/test_config.py @@ -0,0 +1,174 @@ +import json +import tempfile +import os +from typing import Text + +import pytest + +import rasa.utils.io as io_utils +from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu import config +from rasa.nlu.components import ComponentBuilder +from rasa.nlu.registry import registered_pipeline_templates +from rasa.nlu.model import Trainer +from tests.nlu.utilities import write_file_config + + +def test_blank_config(blank_config): + file_config = {} + f = write_file_config(file_config) + final_config = config.load(f.name) + + assert final_config.as_dict() == blank_config.as_dict() + + +def test_invalid_config_json(): + file_config = """pipeline: [pretrained_embeddings_spacy""" # invalid yaml + + with tempfile.NamedTemporaryFile("w+", suffix="_tmp_config_file.json") as f: + f.write(file_config) + f.flush() + + with 
pytest.raises(config.InvalidConfigError): + config.load(f.name) + + +def test_invalid_pipeline_template(): + args = {"pipeline": "my_made_up_name"} + f = write_file_config(args) + + with pytest.raises(config.InvalidConfigError) as execinfo: + config.load(f.name) + assert "unknown pipeline template" in str(execinfo.value) + + +def test_invalid_many_tokenizers_in_config(): + nlu_config = { + "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyTokenizer"}] + } + + with pytest.raises(config.InvalidConfigError) as execinfo: + Trainer(config.RasaNLUModelConfig(nlu_config)) + assert "More then one tokenizer is used" in str(execinfo.value) + + +@pytest.mark.parametrize( + "_config", + [ + {"pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyFeaturizer"}]}, + {"pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "ConveRTFeaturizer"}]}, + { + "pipeline": [ + {"name": "ConveRTTokenizer"}, + {"name": "LanguageModelFeaturizer"}, + ] + }, + ], +) +def test_missing_required_component(_config): + with pytest.raises(config.InvalidConfigError) as execinfo: + Trainer(config.RasaNLUModelConfig(_config)) + assert "Add required components to the pipeline" in str(execinfo.value) + + +@pytest.mark.parametrize( + "pipeline_config", [{"pipeline": [{"name": "CountVectorsFeaturizer"}]}] +) +def test_missing_property(pipeline_config): + with pytest.raises(config.InvalidConfigError) as execinfo: + Trainer(config.RasaNLUModelConfig(pipeline_config)) + assert "Add required components to the pipeline" in str(execinfo.value) + + +@pytest.mark.parametrize( + "pipeline_template", list(registered_pipeline_templates.keys()) +) +def test_pipeline_registry_lookup(pipeline_template: Text): + args = {"pipeline": pipeline_template} + f = write_file_config(args) + + final_config = config.load(f.name) + components = [c for c in final_config.pipeline] + + assert json.dumps(components, sort_keys=True) == json.dumps( + registered_pipeline_templates[pipeline_template], sort_keys=True + ) + + +def test_default_config_file(): + final_config = config.RasaNLUModelConfig() + assert len(final_config) > 1 + + +def test_set_attr_on_component(): + _config = RasaNLUModelConfig( + { + "language": "en", + "pipeline": [ + {"name": "SpacyNLP"}, + {"name": "SpacyTokenizer"}, + {"name": "SpacyFeaturizer"}, + {"name": "DIETClassifier"}, + ], + } + ) + idx_classifier = _config.component_names.index("DIETClassifier") + idx_tokenizer = _config.component_names.index("SpacyTokenizer") + + _config.set_component_attr(idx_classifier, epochs=10) + + assert _config.for_component(idx_tokenizer) == {"name": "SpacyTokenizer"} + assert _config.for_component(idx_classifier) == { + "name": "DIETClassifier", + "epochs": 10, + } + + +def test_override_defaults_supervised_embeddings_pipeline(): + builder = ComponentBuilder() + + _config = RasaNLUModelConfig( + { + "language": "en", + "pipeline": [ + {"name": "SpacyNLP"}, + {"name": "SpacyTokenizer"}, + {"name": "SpacyFeaturizer", "pooling": "max"}, + {"name": "DIETClassifier", "epochs": 10}, + ], + } + ) + + idx_featurizer = _config.component_names.index("SpacyFeaturizer") + idx_classifier = _config.component_names.index("DIETClassifier") + + component1 = builder.create_component( + _config.for_component(idx_featurizer), _config + ) + assert component1.component_config["pooling"] == "max" + + component2 = builder.create_component( + _config.for_component(idx_classifier), _config + ) + assert component2.component_config["epochs"] == 10 + + +def config_files_in(config_directory: Text): + return [ + 
os.path.join(config_directory, f) + for f in os.listdir(config_directory) + if os.path.isfile(os.path.join(config_directory, f)) + ] + + +@pytest.mark.parametrize( + "config_file", + config_files_in("data/configs_for_docs") + config_files_in("docker/configs"), +) +def test_train_docker_and_docs_configs(config_file: Text): + content = io_utils.read_yaml_file(config_file) + + loaded_config = config.load(config_file) + + assert len(loaded_config.component_names) > 1 + assert loaded_config.language == content["language"] diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/test_evaluation.py similarity index 94% rename from tests/nlu/base/test_evaluation.py rename to tests/nlu/test_evaluation.py index 95436865d55c..7312fded6d23 100644 --- a/tests/nlu/base/test_evaluation.py +++ b/tests/nlu/test_evaluation.py @@ -2,14 +2,14 @@ from typing import Text, Iterator import asyncio -import logging import pytest from _pytest.tmpdir import TempdirFactory import rasa.utils.io +from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor from rasa.test import compare_nlu_models -from rasa.nlu.extractors import EntityExtractor +from rasa.nlu.extractors.extractor import EntityExtractor from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor from rasa.nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor from rasa.nlu.model import Interpreter @@ -43,19 +43,21 @@ from rasa.nlu.test import determine_token_labels from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Token -from rasa.nlu import utils import json import os -from rasa.nlu import training_data, config +from rasa.nlu import training_data from tests.nlu import utilities -from tests.nlu.conftest import DEFAULT_DATA_PATH, NLU_DEFAULT_CONFIG_PATH -from rasa.nlu.selectors.embedding_response_selector import ResponseSelector +from tests.nlu.conftest import DEFAULT_DATA_PATH +from rasa.nlu.selectors.response_selector import ResponseSelector from rasa.nlu.test import is_response_selector_present +from rasa.utils.tensorflow.constants import EPOCHS # https://github.com/pytest-dev/pytest-asyncio/issues/68 # this event_loop is used by pytest-asyncio, and redefining it # is currently the only way of changing the scope of this fixture + + @pytest.yield_fixture(scope="session") def event_loop(request: Request) -> Iterator[asyncio.AbstractEventLoop]: loop = asyncio.get_event_loop_policy().new_event_loop() @@ -211,7 +213,7 @@ def test_determine_token_labels_throws_error(): determine_token_labels( CH_correct_segmentation[0], [CH_correct_entity, CH_wrong_entity], - ["CRFEntityExtractor"], + [CRFEntityExtractor.name], ) @@ -269,22 +271,22 @@ def test_drop_intents_below_freq(): def test_run_evaluation(unpacked_trained_moodbot_path): - data = DEFAULT_DATA_PATH - result = run_evaluation( - data, os.path.join(unpacked_trained_moodbot_path, "nlu"), errors=False + DEFAULT_DATA_PATH, + os.path.join(unpacked_trained_moodbot_path, "nlu"), + errors=False, ) + assert result.get("intent_evaluation") - assert result.get("entity_evaluation").get("CRFEntityExtractor") + assert result.get("entity_evaluation").get("DIETClassifier") -def test_run_cv_evaluation(): +def test_run_cv_evaluation(pretrained_embeddings_spacy_config): td = training_data.load_data("data/examples/rasa/demo-rasa.json") - nlu_config = config.load("sample_configs/config_pretrained_embeddings_spacy.yml") n_folds = 2 intent_results, entity_results, response_selection_results = cross_validate( - td, n_folds, nlu_config + td, n_folds, 
pretrained_embeddings_spacy_config ) assert len(intent_results.train["Accuracy"]) == n_folds @@ -309,8 +311,16 @@ def test_run_cv_evaluation_with_response_selector(): training_data_obj = training_data_obj.merge(training_data_responses_obj) training_data_obj.fill_response_phrases() - nlu_config = config.load( - "sample_configs/config_embedding_intent_response_selector.yml" + nlu_config = RasaNLUModelConfig( + { + "language": "en", + "pipeline": [ + {"name": "WhitespaceTokenizer"}, + {"name": "CountVectorsFeaturizer"}, + {"name": "DIETClassifier", EPOCHS: 2}, + {"name": "ResponseSelector", EPOCHS: 2}, + ], + } ) n_folds = 2 @@ -330,9 +340,12 @@ def test_run_cv_evaluation_with_response_selector(): assert len(response_selection_results.test["Accuracy"]) == n_folds assert len(response_selection_results.test["Precision"]) == n_folds assert len(response_selection_results.test["F1-score"]) == n_folds - # No entity extractor in pipeline - assert len(entity_results.train) == 0 - assert len(entity_results.test) == 0 + assert len(entity_results.train["DIETClassifier"]["Accuracy"]) == n_folds + assert len(entity_results.train["DIETClassifier"]["Precision"]) == n_folds + assert len(entity_results.train["DIETClassifier"]["F1-score"]) == n_folds + assert len(entity_results.test["DIETClassifier"]["Accuracy"]) == n_folds + assert len(entity_results.test["DIETClassifier"]["Precision"]) == n_folds + assert len(entity_results.test["DIETClassifier"]["F1-score"]) == n_folds def test_response_selector_present(): @@ -714,14 +727,12 @@ def test_get_evaluation_metrics( assert NO_ENTITY not in report -def test_nlu_comparison(tmpdir): - configs = [ - NLU_DEFAULT_CONFIG_PATH, - "sample_configs/config_embedding_intent_response_selector.yml", - ] +def test_nlu_comparison(tmpdir, config_path): + configs = [config_path, config_path] + output = tmpdir.strpath compare_nlu_models( - configs, DEFAULT_DATA_PATH, output, runs=2, exclusion_percentages=[50, 80], + configs, DEFAULT_DATA_PATH, output, runs=2, exclusion_percentages=[50, 80] ) assert set(os.listdir(output)) == { diff --git a/tests/nlu/base/test_interpreter.py b/tests/nlu/test_interpreter.py similarity index 91% rename from tests/nlu/base/test_interpreter.py rename to tests/nlu/test_interpreter.py index f83807d4c0c7..0960524a9c4e 100644 --- a/tests/nlu/base/test_interpreter.py +++ b/tests/nlu/test_interpreter.py @@ -18,13 +18,18 @@ @pytest.mark.parametrize( "pipeline_template", list(registry.registered_pipeline_templates.keys()) ) -async def test_interpreter(pipeline_template, component_builder, tmpdir): +async def test_interpreter_on_pipeline_templates( + pipeline_template, component_builder, tmpdir +): test_data = "data/examples/rasa/demo-rasa.json" - _conf = utilities.base_test_conf(pipeline_template) - _conf["data"] = test_data + + config = utilities.base_test_conf(pipeline_template) + config["data"] = test_data + td = training_data.load_data(test_data) + interpreter = await utilities.interpreter_for( - component_builder, "data/examples/rasa/demo-rasa.json", tmpdir.strpath, _conf + component_builder, "data/examples/rasa/demo-rasa.json", tmpdir.strpath, config ) texts = ["good bye", "i am looking for an indian spot"] @@ -60,9 +65,10 @@ async def test_interpreter(pipeline_template, component_builder, tmpdir): {"rasa_version": "0.14.4"}, {"rasa_version": "0.15.0a1"}, {"rasa_version": "1.0.0a1"}, + {"rasa_version": "1.5.0"}, ], ) -def test_model_not_compatible(metadata): +def test_model_is_not_compatible(metadata): with 
pytest.raises(rasa.nlu.model.UnsupportedModelError): Interpreter.ensure_model_compatibility(metadata) diff --git a/tests/nlu/base/test_persistor.py b/tests/nlu/test_persistor.py similarity index 88% rename from tests/nlu/base/test_persistor.py rename to tests/nlu/test_persistor.py index 8371060a37bd..3dba8e43b617 100644 --- a/tests/nlu/base/test_persistor.py +++ b/tests/nlu/test_persistor.py @@ -14,7 +14,7 @@ class Object: # noinspection PyPep8Naming @mock_s3 -async def test_list_method_method_in_AWSPersistor(component_builder, tmpdir): +async def test_list_models_method_in_AWS_persistor(component_builder, tmpdir): # artificially create a persisted model _config = utilities.base_test_conf("keyword") os.environ["BUCKET_NAME"] = "rasa-test" @@ -38,10 +38,10 @@ async def test_list_method_method_in_AWSPersistor(component_builder, tmpdir): # noinspection PyPep8Naming @mock_s3 -def test_list_models_method_raise_exeception_in_AWSPersistor(): +def test_list_models_method_raise_exception_in_AWS_persistor(): os.environ["AWS_DEFAULT_REGION"] = "us-east-1" - awspersistor = persistor.AWSPersistor("rasa-test") + awspersistor = persistor.AWSPersistor("rasa-test", region_name="foo") result = awspersistor.list_models() assert result == [] @@ -54,7 +54,9 @@ def test_retrieve_tar_archive_with_s3_namespace(): destination = "dst" with patch.object(persistor.AWSPersistor, "_decompress") as decompress: with patch.object(persistor.AWSPersistor, "_retrieve_tar") as retrieve: - persistor.AWSPersistor("rasa-test").retrieve(model, destination) + persistor.AWSPersistor("rasa-test", region_name="foo").retrieve( + model, destination + ) decompress.assert_called_once_with("model.tar.gz", destination) retrieve.assert_called_once_with(model) @@ -65,7 +67,7 @@ def test_s3_private_retrieve_tar(): # Ensure the S3 persistor writes to a filename `model.tar.gz`, whilst # passing the fully namespaced path to boto3 model = "/my/s3/project/model.tar.gz" - awsPersistor = persistor.AWSPersistor("rasa-test") + awsPersistor = persistor.AWSPersistor("rasa-test", region_name="foo") with patch.object(awsPersistor.bucket, "download_fileobj") as download_fileobj: # noinspection PyProtectedMember awsPersistor._retrieve_tar(model) @@ -75,7 +77,7 @@ def test_s3_private_retrieve_tar(): # noinspection PyPep8Naming -def test_list_models_method_in_GCSPersistor(): +def test_list_models_method_in_GCS_persistor(): # noinspection PyUnusedLocal def mocked_init(self, *args, **kwargs): self._model_dir_and_model_from_filename = lambda x: { @@ -97,7 +99,7 @@ def mocked_list_blobs(): # noinspection PyPep8Naming -def test_list_models_method_raise_exeception_in_GCSPersistor(): +def test_list_models_method_raise_exception_in_GCS_persistor(): # noinspection PyUnusedLocal def mocked_init(self, *args, **kwargs): self._model_dir_and_model_from_filename = lambda x: { @@ -117,7 +119,7 @@ def mocked_list_blobs(): # noinspection PyPep8Naming -def test_list_models_method_in_AzurePersistor(): +def test_list_models_method_in_Azure_persistor(): # noinspection PyUnusedLocal def mocked_init(self, *args, **kwargs): self._model_dir_and_model_from_filename = lambda x: { @@ -141,7 +143,7 @@ def mocked_list_blobs(container_name, prefix=None): # noinspection PyPep8Naming -def test_list_models_method_raise_exeception_in_AzurePersistor(): +def test_list_models_method_raise_exception_in_Azure_persistor(): def mocked_init(self, *args, **kwargs): self._model_dir_and_model_from_filename = lambda x: {"blob_name": ("project",)}[ x diff --git a/tests/nlu/training/test_train.py 
b/tests/nlu/test_train.py similarity index 51% rename from tests/nlu/training/test_train.py rename to tests/nlu/test_train.py index 6085b4451099..7e1576d667b2 100644 --- a/tests/nlu/training/test_train.py +++ b/tests/nlu/test_train.py @@ -4,14 +4,13 @@ from rasa.nlu import registry, train from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Interpreter, Trainer -from rasa.nlu.train import create_persistor from rasa.nlu.training_data import TrainingData -from tests.nlu import utilities +from rasa.utils.tensorflow.constants import EPOCHS from tests.nlu.conftest import DEFAULT_DATA_PATH def as_pipeline(*components): - return [{"name": c} for c in components] + return [{"name": c, EPOCHS: 1} for c in components] def pipelines_for_tests(): @@ -20,43 +19,58 @@ def pipelines_for_tests(): # tested they still need to be in a useful order - hence we can not simply # generate this automatically. + # Create separate test pipelines for dense featurizers + # because they can't co-exist in the same pipeline together, + # as their tokenizers break the incoming message into different number of tokens. + # first is language followed by list of components return [ + ("en", as_pipeline("KeywordIntentClassifier")), ( "en", as_pipeline( - "SpacyNLP", - "MitieNLP", "WhitespaceTokenizer", - "ConveRTTokenizer", - "MitieTokenizer", - "SpacyTokenizer", - "MitieFeaturizer", - "SpacyFeaturizer", "RegexFeaturizer", + "LexicalSyntacticFeaturizer", "CountVectorsFeaturizer", - "ConveRTFeaturizer", - "MitieEntityExtractor", "CRFEntityExtractor", - "SpacyEntityExtractor", "DucklingHTTPExtractor", - "EntitySynonymMapper", - "SklearnIntentClassifier", - "MitieIntentClassifier", + "DIETClassifier", "EmbeddingIntentClassifier", - "KeywordIntentClassifier", "ResponseSelector", + "EntitySynonymMapper", ), ), ( - "zh", + "en", as_pipeline( - "MitieNLP", - "JiebaTokenizer", - "MitieFeaturizer", - "MitieEntityExtractor", + "SpacyNLP", + "SpacyTokenizer", + "SpacyFeaturizer", + "SpacyEntityExtractor", "SklearnIntentClassifier", - "KeywordIntentClassifier", + ), + ), + ( + "en", + as_pipeline( + "HFTransformersNLP", + "LanguageModelTokenizer", + "LanguageModelFeaturizer", + "DIETClassifier", + ), + ), + ("en", as_pipeline("ConveRTTokenizer", "ConveRTFeaturizer", "DIETClassifier")), + ( + "en", + as_pipeline( + "MitieNLP", "MitieTokenizer", "MitieFeaturizer", "MitieIntentClassifier" + ), + ), + ( + "zh", + as_pipeline( + "MitieNLP", "JiebaTokenizer", "MitieFeaturizer", "MitieEntityExtractor" ), ), ] @@ -65,97 +79,65 @@ def pipelines_for_tests(): def test_all_components_are_in_at_least_one_test_pipeline(): """There is a template that includes all components to test the train-persist-load-use cycle. Ensures that - really all Components are in there.""" + really all components are in there.""" all_components = [c["name"] for _, p in pipelines_for_tests() for c in p] + for cls in registry.component_classes: assert ( cls.name in all_components ), "`all_components` template is missing component." 
-@pytest.mark.parametrize( - "pipeline_template", list(registry.registered_pipeline_templates.keys()) -) -async def test_train_model(pipeline_template, component_builder, tmpdir): - _config = utilities.base_test_conf(pipeline_template) +@pytest.mark.parametrize("language, pipeline", pipelines_for_tests()) +async def test_train_persist_load_parse(language, pipeline, component_builder, tmpdir): + _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) + (trained, _, persisted_path) = await train( _config, path=tmpdir.strpath, data=DEFAULT_DATA_PATH, component_builder=component_builder, ) - assert trained.pipeline - loaded = Interpreter.load(persisted_path, component_builder) - assert loaded.pipeline - assert loaded.parse("hello") is not None - assert loaded.parse("Hello today is Monday, again!") is not None + assert trained.pipeline -async def test_random_seed(component_builder, tmpdir): - """test if train result is the same for two runs of tf embedding""" + loaded = Interpreter.load(persisted_path, component_builder) - _config = utilities.base_test_conf("supervised_embeddings") - # set fixed random seed of the embedding intent classifier to 1 - _config.set_component_attr(6, random_seed=1) - # first run - (trained_a, _, persisted_path_a) = await train( - _config, - path=tmpdir.strpath + "_a", - data=DEFAULT_DATA_PATH, - component_builder=component_builder, - ) - # second run - (trained_b, _, persisted_path_b) = await train( - _config, - path=tmpdir.strpath + "_b", - data=DEFAULT_DATA_PATH, - component_builder=component_builder, - ) - loaded_a = Interpreter.load(persisted_path_a, component_builder) - loaded_b = Interpreter.load(persisted_path_b, component_builder) - result_a = loaded_a.parse("hello")["intent"]["confidence"] - result_b = loaded_b.parse("hello")["intent"]["confidence"] - assert result_a == result_b + assert loaded.pipeline + assert loaded.parse("Rasa is great!") is not None @pytest.mark.parametrize("language, pipeline", pipelines_for_tests()) -async def test_train_model_on_test_pipelines( - language, pipeline, component_builder, tmpdir -): +def test_train_model_without_data(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) - (trained, _, persisted_path) = await train( - _config, - path=tmpdir.strpath, - data=DEFAULT_DATA_PATH, - component_builder=component_builder, - ) - assert trained.pipeline + + trainer = Trainer(_config, component_builder) + trainer.train(TrainingData()) + persisted_path = trainer.persist(tmpdir.strpath) + loaded = Interpreter.load(persisted_path, component_builder) + assert loaded.pipeline - assert loaded.parse("hello") is not None - assert loaded.parse("Hello today is Monday, again!") is not None + assert loaded.parse("Rasa is great!") is not None @pytest.mark.parametrize("language, pipeline", pipelines_for_tests()) -async def test_train_model_no_events(language, pipeline, component_builder, tmpdir): +def test_load_and_persist_without_train(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) - (trained, _, persisted_path) = await train( - _config, - path=tmpdir.strpath, - data="./data/test/demo-rasa-noents.json", - component_builder=component_builder, - ) - assert trained.pipeline + + trainer = Trainer(_config, component_builder) + persisted_path = trainer.persist(tmpdir.strpath) + loaded = Interpreter.load(persisted_path, component_builder) + assert loaded.pipeline - assert 
loaded.parse("hello") is not None - assert loaded.parse("Hello today is Monday, again!") is not None + assert loaded.parse("Rasa is great!") is not None async def test_train_model_empty_pipeline(component_builder): - # Should return an empty pipeline - _config = utilities.base_test_conf(pipeline_template=None) + _config = RasaNLUModelConfig({"pipeline": None, "language": "en"}) + with pytest.raises(ValueError): await train( _config, data=DEFAULT_DATA_PATH, component_builder=component_builder @@ -163,79 +145,69 @@ async def test_train_model_empty_pipeline(component_builder): async def test_train_named_model(component_builder, tmpdir): - _config = utilities.base_test_conf("keyword") + _config = RasaNLUModelConfig({"pipeline": "keyword", "language": "en"}) + (trained, _, persisted_path) = await train( _config, path=tmpdir.strpath, data=DEFAULT_DATA_PATH, component_builder=component_builder, ) + assert trained.pipeline + normalized_path = os.path.dirname(os.path.normpath(persisted_path)) # should be saved in a dir named after a project assert normalized_path == tmpdir.strpath -async def test_handles_pipeline_with_non_existing_component(component_builder): - _config = utilities.base_test_conf("pretrained_embeddings_spacy") - _config.pipeline.append({"name": "my_made_up_component"}) +async def test_handles_pipeline_with_non_existing_component( + component_builder, pretrained_embeddings_spacy_config +): + pretrained_embeddings_spacy_config.pipeline.append({"name": "my_made_up_component"}) + with pytest.raises(Exception) as execinfo: await train( - _config, data=DEFAULT_DATA_PATH, component_builder=component_builder + pretrained_embeddings_spacy_config, + data=DEFAULT_DATA_PATH, + component_builder=component_builder, ) assert "Cannot find class" in str(execinfo.value) -@pytest.mark.parametrize("language, pipeline", pipelines_for_tests()) -def test_load_and_persist_without_train(language, pipeline, component_builder, tmpdir): - _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) - trainer = Trainer(_config, component_builder) - persistor = create_persistor(_config) - persisted_path = trainer.persist(tmpdir.strpath, persistor) - loaded = Interpreter.load(persisted_path, component_builder) - assert loaded.pipeline - assert loaded.parse("hello") is not None - assert loaded.parse("Hello today is Monday, again!") is not None - - -@pytest.mark.parametrize("language, pipeline", pipelines_for_tests()) -def test_train_with_empty_data(language, pipeline, component_builder, tmpdir): - _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) - trainer = Trainer(_config, component_builder) - trainer.train(TrainingData()) - persistor = create_persistor(_config) - persisted_path = trainer.persist(tmpdir.strpath, persistor) - loaded = Interpreter.load(persisted_path, component_builder) - assert loaded.pipeline - assert loaded.parse("hello") is not None - assert loaded.parse("Hello today is Monday, again!") is not None - +async def test_train_model_training_data_persisted(component_builder, tmpdir): + _config = RasaNLUModelConfig({"pipeline": "keyword", "language": "en"}) -async def test_train_model_no_training_data_persisted(component_builder, tmpdir): - _config = utilities.base_test_conf("keyword") (trained, _, persisted_path) = await train( _config, path=tmpdir.strpath, data=DEFAULT_DATA_PATH, component_builder=component_builder, - persist_nlu_training_data=False, + persist_nlu_training_data=True, ) + assert trained.pipeline + loaded = Interpreter.load(persisted_path, 
component_builder) + assert loaded.pipeline - assert loaded.model_metadata.get("training_data") is None + assert loaded.model_metadata.get("training_data") is not None -async def test_train_model_training_data_persisted(component_builder, tmpdir): - _config = utilities.base_test_conf("keyword") +async def test_train_model_no_training_data_persisted(component_builder, tmpdir): + _config = RasaNLUModelConfig({"pipeline": "keyword", "language": "en"}) + (trained, _, persisted_path) = await train( _config, path=tmpdir.strpath, data=DEFAULT_DATA_PATH, component_builder=component_builder, - persist_nlu_training_data=True, + persist_nlu_training_data=False, ) + assert trained.pipeline + loaded = Interpreter.load(persisted_path, component_builder) + assert loaded.pipeline - assert loaded.model_metadata.get("training_data") is not None + assert loaded.model_metadata.get("training_data") is None diff --git a/tests/nlu/base/test_utils.py b/tests/nlu/test_utils.py similarity index 100% rename from tests/nlu/base/test_utils.py rename to tests/nlu/test_utils.py diff --git a/tests/nlu/tokenizers/test_convert_tokenizer.py b/tests/nlu/tokenizers/test_convert_tokenizer.py index 30c052c3e781..06cd3a9e7bad 100644 --- a/tests/nlu/tokenizers/test_convert_tokenizer.py +++ b/tests/nlu/tokenizers/test_convert_tokenizer.py @@ -1,7 +1,7 @@ import pytest from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, TOKENS_NAMES +from rasa.nlu.constants import TEXT, INTENT, TOKENS_NAMES from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer @@ -27,7 +27,7 @@ def test_convert_tokenizer_edge_cases(text, expected_tokens, expected_indices): tk = ConveRTTokenizer() - tokens = tk.tokenize(Message(text), attribute=TEXT_ATTRIBUTE) + tokens = tk.tokenize(Message(text), attribute=TEXT) assert [t.text for t in tokens] == expected_tokens assert [t.start for t in tokens] == [i[0] for i in expected_indices] @@ -47,10 +47,8 @@ def test_custom_intent_symbol(text, expected_tokens): tk = ConveRTTokenizer(component_config) message = Message(text) - message.set(INTENT_ATTRIBUTE, text) + message.set(INTENT, text) tk.train(TrainingData([message])) - assert [ - t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE]) - ] == expected_tokens + assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens diff --git a/tests/nlu/tokenizers/test_jieba_tokenizer.py b/tests/nlu/tokenizers/test_jieba_tokenizer.py index 428e4b7cd350..426215541587 100644 --- a/tests/nlu/tokenizers/test_jieba_tokenizer.py +++ b/tests/nlu/tokenizers/test_jieba_tokenizer.py @@ -5,7 +5,7 @@ import pytest from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, TOKENS_NAMES +from rasa.nlu.constants import TEXT, INTENT, TOKENS_NAMES @pytest.mark.parametrize( @@ -26,7 +26,7 @@ def test_jieba(text, expected_tokens, expected_indices): tk = JiebaTokenizer() - tokens = tk.tokenize(Message(text), attribute=TEXT_ATTRIBUTE) + tokens = tk.tokenize(Message(text), attribute=TEXT) assert [t.text for t in tokens] == expected_tokens assert [t.start for t in tokens] == [i[0] for i in expected_indices] @@ -42,7 +42,7 @@ def test_jieba_load_dictionary(tmpdir_factory): JiebaTokenizer, "load_custom_dictionary", return_value=None ) as mock_method: tk = JiebaTokenizer(component_config) - tk.tokenize(Message(""), attribute=TEXT_ATTRIBUTE) + tk.tokenize(Message(""), attribute=TEXT) mock_method.assert_called_once_with(dictionary_path) @@ 
-60,10 +60,8 @@ def test_custom_intent_symbol(text, expected_tokens): tk = JiebaTokenizer(component_config) message = Message(text) - message.set(INTENT_ATTRIBUTE, text) + message.set(INTENT, text) tk.train(TrainingData([message])) - assert [ - t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE]) - ] == expected_tokens + assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens diff --git a/tests/nlu/tokenizers/test_lm_tokenizer.py b/tests/nlu/tokenizers/test_lm_tokenizer.py new file mode 100644 index 000000000000..435c3341a2aa --- /dev/null +++ b/tests/nlu/tokenizers/test_lm_tokenizer.py @@ -0,0 +1,340 @@ +import pytest + +from rasa.nlu.training_data import Message, TrainingData +from rasa.nlu.constants import TEXT, INTENT, TOKENS_NAMES +from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer +from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP + + +@pytest.mark.parametrize( + "model_name, texts, expected_tokens, expected_indices", + [ + ( + "bert", + [ + "Good evening.", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["good", "evening"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sentence", + "i", + "want", + "em", + "bed", + "ding", + "s", + "for", + ], + ], + [ + [(0, 4), (5, 12)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 30), + (30, 33), + (33, 37), + (37, 38), + (39, 42), + ], + ], + ), + ( + "gpt", + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["good", "evening"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + ["here", "is", "the", "sentence", "i", "want", "embe", "ddings", "for"], + ], + [ + [(0, 4), (5, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 32), + (32, 38), + (39, 42), + ], + ], + ), + ( + "gpt2", + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["Good", "even", "ing"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sent", + "ence", + "I", + "want", + "embed", + "d", + "ings", + "for", + ], + ], + [ + [(0, 4), (5, 9), (9, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 16), + (16, 20), + (21, 22), + (23, 27), + (28, 33), + (33, 34), + (34, 38), + (39, 42), + ], + ], + ), + ( + "xlnet", + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["Good", "evening"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sentence", + "I", + "want", + "embed", + "ding", + "s", + "for", + ], + ], + [ + [(0, 4), (5, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 33), + (33, 37), + (37, 38), + (39, 42), + ], + ], + ), + ( + "distilbert", + [ + "Good evening.", + "you're", + "r. n. 
b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["good", "evening"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sentence", + "i", + "want", + "em", + "bed", + "ding", + "s", + "for", + ], + ], + [ + [(0, 4), (5, 12)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 20), + (21, 22), + (23, 27), + (28, 30), + (30, 33), + (33, 37), + (37, 38), + (39, 42), + ], + ], + ), + ( + "roberta", + [ + "Good evening.", + "hello", + "you're", + "r. n. b.", + "rock & roll", + "here is the sentence I want embeddings for.", + ], + [ + ["Good", "even", "ing"], + ["hello"], + ["you", "re"], + ["r", "n", "b"], + ["rock", "&", "roll"], + [ + "here", + "is", + "the", + "sent", + "ence", + "I", + "want", + "embed", + "d", + "ings", + "for", + ], + ], + [ + [(0, 4), (5, 9), (9, 12)], + [(0, 5)], + [(0, 3), (4, 6)], + [(0, 1), (3, 4), (6, 7)], + [(0, 4), (5, 6), (7, 11)], + [ + (0, 4), + (5, 7), + (8, 11), + (12, 16), + (16, 20), + (21, 22), + (23, 27), + (28, 33), + (33, 34), + (34, 38), + (39, 42), + ], + ], + ), + ], +) +def test_lm_tokenizer_edge_cases(model_name, texts, expected_tokens, expected_indices): + + transformers_config = {"model_name": model_name} + + transformers_nlp = HFTransformersNLP(transformers_config) + lm_tokenizer = LanguageModelTokenizer() + + for text, gt_tokens, gt_indices in zip(texts, expected_tokens, expected_indices): + + message = Message.build(text=text) + transformers_nlp.process(message) + tokens = lm_tokenizer.tokenize(message, TEXT) + + assert [t.text for t in tokens] == gt_tokens + assert [t.start for t in tokens] == [i[0] for i in gt_indices] + assert [t.end for t in tokens] == [i[1] for i in gt_indices] + + +@pytest.mark.parametrize( + "text, expected_tokens", + [ + ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]), + ("Forecast for LUNCH", ["Forecast for LUNCH"]), + ("Forecast+for+LUNCH", ["Forecast", "for", "LUNCH"]), + ], +) +def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens): + component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"} + + transformers_config = {"model_name": "bert"} # Test for one should be enough + + transformers_nlp = HFTransformersNLP(transformers_config) + lm_tokenizer = LanguageModelTokenizer(component_config) + + message = Message(text) + message.set(INTENT, text) + + td = TrainingData([message]) + + transformers_nlp.train(td) + lm_tokenizer.train(td) + + assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens diff --git a/tests/nlu/tokenizers/test_mitie_tokenizer.py b/tests/nlu/tokenizers/test_mitie_tokenizer.py index 647290990b48..ebf40b0f9415 100644 --- a/tests/nlu/tokenizers/test_mitie_tokenizer.py +++ b/tests/nlu/tokenizers/test_mitie_tokenizer.py @@ -1,7 +1,7 @@ import pytest from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, TOKENS_NAMES +from rasa.nlu.constants import TEXT, INTENT, TOKENS_NAMES from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer @@ -23,7 +23,7 @@ def test_mitie(text, expected_tokens, expected_indices): tk = MitieTokenizer() - tokens = tk.tokenize(Message(text), attribute=TEXT_ATTRIBUTE) + tokens = tk.tokenize(Message(text), attribute=TEXT) assert [t.text for t in tokens] == expected_tokens assert [t.start for t in tokens] == [i[0] for i in expected_indices] @@ -43,10 +43,8 @@ def test_custom_intent_symbol(text, expected_tokens): tk 
= MitieTokenizer(component_config) message = Message(text) - message.set(INTENT_ATTRIBUTE, text) + message.set(INTENT, text) tk.train(TrainingData([message])) - assert [ - t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE]) - ] == expected_tokens + assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens diff --git a/tests/nlu/tokenizers/test_spacy_tokenizer.py b/tests/nlu/tokenizers/test_spacy_tokenizer.py index 66e8a2b80919..0a3f7ecf42f7 100644 --- a/tests/nlu/tokenizers/test_spacy_tokenizer.py +++ b/tests/nlu/tokenizers/test_spacy_tokenizer.py @@ -4,10 +4,10 @@ from rasa.nlu.training_data import Message from rasa.nlu.constants import ( CLS_TOKEN, - TEXT_ATTRIBUTE, + TEXT, SPACY_DOCS, - INTENT_ATTRIBUTE, - RESPONSE_ATTRIBUTE, + INTENT, + RESPONSE, TOKENS_NAMES, ) from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer @@ -32,15 +32,33 @@ def test_spacy(text, expected_tokens, expected_indices, spacy_nlp): tk = SpacyTokenizer() message = Message(text) - message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text)) + message.set(SPACY_DOCS[TEXT], spacy_nlp(text)) - tokens = tk.tokenize(message, attribute=TEXT_ATTRIBUTE) + tokens = tk.tokenize(message, attribute=TEXT) assert [t.text for t in tokens] == expected_tokens assert [t.start for t in tokens] == [i[0] for i in expected_indices] assert [t.end for t in tokens] == [i[1] for i in expected_indices] +@pytest.mark.parametrize( + "text, expected_pos_tags", + [ + ("Forecast for lunch", ["NN", "IN", "NN"]), + ("Hello, how are you?", ["UH", ",", "WRB", "VBP", "PRP", "."]), + ], +) +def test_spacy_pos_tags(text, expected_pos_tags, spacy_nlp): + tk = SpacyTokenizer() + + message = Message(text) + message.set(SPACY_DOCS[TEXT], spacy_nlp(text)) + + tokens = tk.tokenize(message, attribute=TEXT) + + assert [t.data.get("pos") for t in tokens] == expected_pos_tags + + @pytest.mark.parametrize( "text, expected_tokens, expected_indices", [ @@ -55,16 +73,16 @@ def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp): tk = SpacyTokenizer() message = Message(text) - message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text)) - message.set(RESPONSE_ATTRIBUTE, text) - message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(text)) + message.set(SPACY_DOCS[TEXT], spacy_nlp(text)) + message.set(RESPONSE, text) + message.set(SPACY_DOCS[RESPONSE], spacy_nlp(text)) training_data = TrainingData() training_data.training_examples = [message] tk.train(training_data) - for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]: + for attribute in [RESPONSE, TEXT]: tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute]) assert [t.text for t in tokens] == expected_tokens @@ -85,11 +103,9 @@ def test_custom_intent_symbol(text, expected_tokens, spacy_nlp): tk = SpacyTokenizer(component_config) message = Message(text) - message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text)) - message.set(INTENT_ATTRIBUTE, text) + message.set(SPACY_DOCS[TEXT], spacy_nlp(text)) + message.set(INTENT, text) tk.train(TrainingData([message])) - assert [ - t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE]) - ] == expected_tokens + assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens diff --git a/tests/nlu/tokenizers/test_tokenizer.py b/tests/nlu/tokenizers/test_tokenizer.py index b69bc6af7adf..c526358466d1 100644 --- a/tests/nlu/tokenizers/test_tokenizer.py +++ b/tests/nlu/tokenizers/test_tokenizer.py @@ -2,9 +2,9 @@ from rasa.nlu.constants import ( CLS_TOKEN, - TEXT_ATTRIBUTE, - INTENT_ATTRIBUTE, - 
RESPONSE_ATTRIBUTE, + TEXT, + INTENT, + RESPONSE, TOKENS_NAMES, ) from rasa.nlu.training_data import Message, TrainingData @@ -40,15 +40,15 @@ def test_train_tokenizer(text, expected_tokens, expected_indices): tk = WhitespaceTokenizer() message = Message(text) - message.set(RESPONSE_ATTRIBUTE, text) - message.set(INTENT_ATTRIBUTE, text) + message.set(RESPONSE, text) + message.set(INTENT, text) training_data = TrainingData() training_data.training_examples = [message] tk.train(training_data) - for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]: + for attribute in [RESPONSE, TEXT]: tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute]) assert [t.text for t in tokens] == expected_tokens @@ -56,7 +56,7 @@ def test_train_tokenizer(text, expected_tokens, expected_indices): assert [t.end for t in tokens] == [i[1] for i in expected_indices] # check intent attribute - tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT_ATTRIBUTE]) + tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT]) assert [t.text for t in tokens] == [text] @@ -78,7 +78,7 @@ def test_process_tokenizer(text, expected_tokens, expected_indices): tk.process(message) - tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) + tokens = message.get(TOKENS_NAMES[TEXT]) assert [t.text for t in tokens] == expected_tokens assert [t.start for t in tokens] == [i[0] for i in expected_indices] @@ -98,6 +98,6 @@ def test_split_intent(text, expected_tokens): tk = WhitespaceTokenizer(component_config) message = Message(text) - message.set(INTENT_ATTRIBUTE, text) + message.set(INTENT, text) assert [t.text for t in tk._split_intent(message)] == expected_tokens diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py index 5a7cff88ebaa..5cffefd2746f 100644 --- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -1,6 +1,6 @@ import pytest -from rasa.nlu.constants import TOKENS_NAMES, TEXT_ATTRIBUTE, INTENT_ATTRIBUTE +from rasa.nlu.constants import TOKENS_NAMES, TEXT, INTENT from rasa.nlu.training_data import TrainingData, Message from tests.nlu import utilities from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer @@ -34,7 +34,7 @@ def test_whitespace(text, expected_tokens, expected_indices): tk = WhitespaceTokenizer() - tokens = tk.tokenize(Message(text), attribute=TEXT_ATTRIBUTE) + tokens = tk.tokenize(Message(text), attribute=TEXT) assert [t.text for t in tokens] == expected_tokens assert [t.start for t in tokens] == [i[0] for i in expected_indices] @@ -54,13 +54,11 @@ def test_custom_intent_symbol(text, expected_tokens): tk = WhitespaceTokenizer(component_config) message = Message(text) - message.set(INTENT_ATTRIBUTE, text) + message.set(INTENT, text) tk.train(TrainingData([message])) - assert [ - t.text for t in message.get(TOKENS_NAMES[INTENT_ATTRIBUTE]) - ] == expected_tokens + assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens @pytest.mark.parametrize( @@ -77,14 +75,12 @@ def test_whitespace_with_case(text, component_config, expected_tokens): message = Message(text) - tokens = tk.tokenize(message, attribute=TEXT_ATTRIBUTE) + tokens = tk.tokenize(message, attribute=TEXT) assert [t.text for t in tokens] == expected_tokens -def test_whitespace_training(): - _config = utilities.base_test_conf("supervised_embeddings") - +def test_whitespace_training(supervised_embeddings_config): examples = [ Message( "Any Mexican restaurant will do", @@ 
-109,7 +105,7 @@ def test_whitespace_training(): component_config = {"case_sensitive": False} tk = WhitespaceTokenizer(component_config) - tk.train(TrainingData(training_examples=examples), _config) + tk.train(TrainingData(training_examples=examples), supervised_embeddings_config) assert examples[0].data.get("tokens")[0].text == "any" assert examples[0].data.get("tokens")[1].text == "mexican" diff --git a/tests/nlu/training_data/__init__.py b/tests/nlu/training_data/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/nlu/base/test_training_data.py b/tests/nlu/training_data/test_training_data.py similarity index 99% rename from tests/nlu/base/test_training_data.py rename to tests/nlu/training_data/test_training_data.py index 935d00354843..c0f7c05c1e2f 100644 --- a/tests/nlu/base/test_training_data.py +++ b/tests/nlu/training_data/test_training_data.py @@ -1,11 +1,10 @@ -import logging from typing import Optional, Text import pytest import tempfile from jsonschema import ValidationError -from rasa.nlu.constants import TEXT_ATTRIBUTE +from rasa.nlu.constants import TEXT from rasa.nlu import training_data from rasa.nlu.convert import convert_training_data from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor @@ -275,7 +274,7 @@ def test_repeated_entities(): example = td.entity_examples[0] entities = example.get("entities") assert len(entities) == 1 - tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT_ATTRIBUTE) + tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT) start, end = MitieEntityExtractor.find_entity(entities[0], example.text, tokens) assert start == 9 assert end == 10 @@ -309,7 +308,7 @@ def test_multiword_entities(): example = td.entity_examples[0] entities = example.get("entities") assert len(entities) == 1 - tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT_ATTRIBUTE) + tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT) start, end = MitieEntityExtractor.find_entity(entities[0], example.text, tokens) assert start == 4 assert end == 7 diff --git a/tests/nlu/utilities.py b/tests/nlu/utilities.py index 6cf509ee435d..f06315263d9b 100644 --- a/tests/nlu/utilities.py +++ b/tests/nlu/utilities.py @@ -1,6 +1,4 @@ import tempfile - -import pytest import ruamel.yaml as yaml from rasa.nlu.config import RasaNLUModelConfig @@ -9,11 +7,6 @@ def base_test_conf(pipeline_template): - # 'response_log': temp_log_file_dir(), - # 'port': 5022, - # "path": tempfile.mkdtemp(), - # "data": "./data/test/demo-rasa-small.json" - return RasaNLUModelConfig({"pipeline": pipeline_template}) @@ -34,10 +27,6 @@ async def interpreter_for(component_builder, data, path, config): return interpreter -def temp_log_file_dir(): - return tempfile.mkdtemp(suffix="_rasa_nlu_logs") - - class ResponseTest: def __init__(self, endpoint, expected_response, payload=None): self.endpoint = endpoint diff --git a/tests/nlu/utils/__init__.py b/tests/nlu/utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/nlu/utils/test_bilou_utils.py b/tests/nlu/utils/test_bilou_utils.py new file mode 100644 index 000000000000..0efc08ecb5d8 --- /dev/null +++ b/tests/nlu/utils/test_bilou_utils.py @@ -0,0 +1,141 @@ +import pytest + +import rasa.nlu.utils.bilou_utils as bilou_utils +from rasa.nlu.constants import BILOU_ENTITIES, ENTITIES +from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.nlu.training_data import TrainingData, Message + + +@pytest.mark.parametrize( + "tag, 
expected", + [ + ("B-person", "person"), + ("I-location", "location"), + ("location", "location"), + ("U-company", "company"), + ("L-company", "company"), + ], +) +def test_entity_name_from_tag(tag, expected): + actual = bilou_utils.entity_name_from_tag(tag) + + assert actual == expected + + +@pytest.mark.parametrize( + "tag, expected", + [ + ("B-person", "B"), + ("I-location", "I"), + ("location", None), + ("U-company", "U"), + ("L-company", "L"), + ("O-company", None), + ], +) +def test_bilou_from_tag(tag, expected): + actual = bilou_utils.bilou_prefix_from_tag(tag) + + assert actual == expected + + +def test_tags_to_ids(): + message = Message("Germany is part of the European Union") + message.set( + BILOU_ENTITIES, + ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"], + ) + + tag_id_dict = {"O": 0, "U-location": 1, "B-organisation": 2, "L-organisation": 3} + + tags = bilou_utils.tags_to_ids(message, tag_id_dict) + + assert tags == [1, 0, 0, 0, 0, 2, 3] + + +def test_remove_bilou_prefixes(): + actual = bilou_utils.remove_bilou_prefixes( + ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"] + ) + + assert actual == ["location", "O", "O", "O", "O", "organisation", "organisation"] + + +def test_build_tag_id_dict(): + message_1 = Message("Germany is part of the European Union") + message_1.set( + BILOU_ENTITIES, + ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"], + ) + + message_2 = Message("Berlin is the capital of Germany") + message_2.set(BILOU_ENTITIES, ["U-location", "O", "O", "O", "O", "U-location"]) + + training_data = TrainingData([message_1, message_2]) + + tag_id_dict = bilou_utils.build_tag_id_dict(training_data) + + assert tag_id_dict == { + "O": 0, + "B-location": 1, + "I-location": 2, + "U-location": 3, + "L-location": 4, + "B-organisation": 5, + "I-organisation": 6, + "U-organisation": 7, + "L-organisation": 8, + } + + +def test_apply_bilou_schema(): + tokenizer = WhitespaceTokenizer() + + message_1 = Message("Germany is part of the European Union") + message_1.set( + ENTITIES, + [ + {"start": 0, "end": 7, "value": "Germany", "entity": "location"}, + { + "start": 23, + "end": 37, + "value": "European Union", + "entity": "organisation", + }, + ], + ) + + message_2 = Message("Berlin is the capital of Germany") + message_2.set( + ENTITIES, + [ + {"start": 0, "end": 6, "value": "Berlin", "entity": "location"}, + {"start": 25, "end": 32, "value": "Germany", "entity": "location"}, + ], + ) + + training_data = TrainingData([message_1, message_2]) + + tokenizer.train(training_data) + + bilou_utils.apply_bilou_schema(training_data) + + assert message_1.get(BILOU_ENTITIES) == [ + "U-location", + "O", + "O", + "O", + "O", + "B-organisation", + "L-organisation", + "O", + ] + assert message_2.get(BILOU_ENTITIES) == [ + "U-location", + "O", + "O", + "O", + "O", + "U-location", + "O", + ] diff --git a/tests/test_train.py b/tests/test_train.py index af9dc02412f5..7a71ae0370e5 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -1,11 +1,9 @@ import tempfile import os -import shutil from typing import Text import pytest from _pytest.monkeypatch import MonkeyPatch -from _pytest.tmpdir import TempdirFactory import rasa.model diff --git a/tests/utilities.py b/tests/utilities.py index 6c334f75905e..913be3ea672d 100644 --- a/tests/utilities.py +++ b/tests/utilities.py @@ -1,4 +1,12 @@ from yarl import URL +from typing import Text + +import rasa.utils.io as io_utils + +from rasa.nlu.classifiers.diet_classifier import 
DIETClassifier +from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier +from rasa.nlu.selectors.response_selector import ResponseSelector +from rasa.utils.tensorflow.constants import EPOCHS def latest_request(mocked, request_type, path): @@ -7,3 +15,24 @@ def latest_request(mocked, request_type, path): def json_of_latest_request(r): return r[-1].kwargs["json"] + + +def update_number_of_epochs(config_path: Text, output_file: Text): + config = io_utils.read_yaml_file(config_path) + + if "pipeline" not in config.keys(): + raise ValueError(f"Invalid config provided! File: '{config_path}'.") + + for component in config["pipeline"]: + # do not update epochs for pipeline templates + if not isinstance(component, dict): + continue + + if component["name"] in [ + EmbeddingIntentClassifier.name, + DIETClassifier.name, + ResponseSelector.name, + ]: + component[EPOCHS] = 1 + + io_utils.write_yaml_file(config, output_file) diff --git a/tests/utils/tensorflow/__init__.py b/tests/utils/tensorflow/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/utils/tensorflow/test_tf_environment.py b/tests/utils/tensorflow/test_tf_environment.py new file mode 100644 index 000000000000..3f30570d7975 --- /dev/null +++ b/tests/utils/tensorflow/test_tf_environment.py @@ -0,0 +1,11 @@ +import pytest +from typing import Text, Dict +from rasa.utils.tensorflow.environment import _parse_gpu_config + + +@pytest.mark.parametrize( + "gpu_config_string, parsed_gpu_config", + [("0: 1024", {0: 1024}), ("0:1024, 1:2048", {0: 1024, 1: 2048})], +) +def test_gpu_config_parser(gpu_config_string: Text, parsed_gpu_config: Dict[int, int]): + assert _parse_gpu_config(gpu_config_string) == parsed_gpu_config diff --git a/tests/utils/test_model_data.py b/tests/utils/test_model_data.py new file mode 100644 index 000000000000..627b37c23f1a --- /dev/null +++ b/tests/utils/test_model_data.py @@ -0,0 +1,189 @@ +import copy + +import pytest +import scipy.sparse +import numpy as np + +from rasa.utils.tensorflow.model_data import RasaModelData + + +@pytest.fixture +async def model_data() -> RasaModelData: + return RasaModelData( + label_key="intent_ids", + data={ + "text_features": [ + np.array( + [ + np.random.rand(5, 14), + np.random.rand(2, 14), + np.random.rand(3, 14), + np.random.rand(1, 14), + np.random.rand(3, 14), + ] + ), + np.array( + [ + scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), + ] + ), + ], + "intent_features": [ + np.array( + [ + np.random.randint(2, size=(5, 10)), + np.random.randint(2, size=(2, 10)), + np.random.randint(2, size=(3, 10)), + np.random.randint(2, size=(1, 10)), + np.random.randint(2, size=(3, 10)), + ] + ) + ], + "intent_ids": [np.array([0, 1, 0, 1, 1])], + "tag_ids": [ + np.array( + [ + np.array([[0], [1], [1], [0], [2]]), + np.array([[2], [0]]), + np.array([[0], [1], [1]]), + np.array([[0], [1]]), + np.array([[0], [0], [0]]), + ] + ) + ], + }, + ) + + +def test_shuffle_session_data(model_data: RasaModelData): + before = copy.copy(model_data) + + # precondition + assert np.all( + np.array(list(before.values())) == np.array(list(model_data.values())) + ) + + data = model_data._shuffled_data(model_data.data) + + # check that original data didn't change + assert np.all( + 
np.array(list(before.values())) == np.array(list(model_data.values())) + ) + # check that new data is different + assert np.all(np.array(model_data.values()) != np.array(data.values())) + + +def test_split_data_by_label(model_data: RasaModelData): + split_model_data = model_data._split_by_label_ids( + model_data.data, model_data.get("intent_ids")[0], np.array([0, 1]) + ) + + assert len(split_model_data) == 2 + for s in split_model_data: + assert len(set(s.get("intent_ids")[0])) == 1 + + +def test_split_data_by_none_label(model_data: RasaModelData): + model_data.label_key = None + + split_model_data = model_data.split(2, 42) + + assert len(split_model_data) == 2 + + train_data = split_model_data[0] + test_data = split_model_data[1] + + # train data should have 3 examples + assert len(train_data.get("intent_ids")[0]) == 3 + # test data should have 2 examples + assert len(test_data.get("intent_ids")[0]) == 2 + + +def test_train_val_split(model_data: RasaModelData): + train_model_data, test_model_data = model_data.split(2, 42) + + for k, values in model_data.items(): + assert len(values) == len(train_model_data.get(k)) + assert len(values) == len(test_model_data.get(k)) + for i, v in enumerate(values): + assert v[0].dtype == train_model_data.get(k)[i][0].dtype + + for values in train_model_data.values(): + for v in values: + assert v.shape[0] == 3 + + for values in test_model_data.values(): + for v in values: + assert v.shape[0] == 2 + + +@pytest.mark.parametrize("size", [0, 1, 5]) +def test_train_val_split_incorrect_size(model_data: RasaModelData, size: int): + with pytest.raises(ValueError): + model_data.split(size, 42) + + +def test_session_data_for_ids(model_data: RasaModelData): + filtered_data = model_data._data_for_ids(model_data.data, np.array([0, 1])) + + for values in filtered_data.values(): + for v in values: + assert v.shape[0] == 2 + + k = list(model_data.keys())[0] + + assert np.all(np.array(filtered_data[k][0][0]) == np.array(model_data.get(k)[0][0])) + assert np.all(np.array(filtered_data[k][0][1]) == np.array(model_data.get(k)[0][1])) + + +def test_get_number_of_examples(model_data: RasaModelData): + assert model_data.number_of_examples() == 5 + + +def test_get_number_of_examples_raises_value_error(model_data: RasaModelData): + model_data.data["dense"] = [np.random.randint(5, size=(2, 10))] + with pytest.raises(ValueError): + model_data.number_of_examples() + + +def test_gen_batch(model_data: RasaModelData): + iterator = model_data._gen_batch(2, shuffle=True, batch_strategy="balanced") + print(model_data.data["tag_ids"][0]) + batch = next(iterator) + assert len(batch) == 7 + assert len(batch[0]) == 2 + + batch = next(iterator) + assert len(batch) == 7 + assert len(batch[0]) == 2 + + batch = next(iterator) + assert len(batch) == 7 + assert len(batch[0]) == 1 + + with pytest.raises(StopIteration): + next(iterator) + + +def test_balance_model_data(model_data: RasaModelData): + data = model_data._balanced_data(model_data.data, 2, False) + + assert np.all(data.get("intent_ids")[0] == np.array([0, 1, 1, 0, 1])) + + +def test_not_balance_model_data(model_data: RasaModelData): + test_model_data = RasaModelData(label_key="tag_ids", data=model_data.data) + + data = test_model_data._balanced_data(test_model_data.data, 2, False) + + assert np.all(data.get("tag_ids") == test_model_data.get("tag_ids")) + + +def test_get_num_of_features(model_data: RasaModelData): + num_features = model_data.feature_dimension("text_features") + + assert num_features == 24 diff --git 
a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py deleted file mode 100644 index bdbb89cde492..000000000000 --- a/tests/utils/test_train_utils.py +++ /dev/null @@ -1,167 +0,0 @@ -import pytest -import scipy.sparse -import numpy as np - -from rasa.utils.train_utils import ( - SessionDataType, - shuffle_session_data, - split_session_data_by_label_ids, - train_val_split, - session_data_for_ids, - get_number_of_examples, - gen_batch, - balance_session_data, -) - - -@pytest.fixture -async def session_data() -> SessionDataType: - return { - "text_features": [ - np.array( - [ - np.random.rand(5, 14), - np.random.rand(2, 14), - np.random.rand(3, 14), - np.random.rand(1, 14), - np.random.rand(3, 14), - ] - ), - np.array( - [ - scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), - ] - ), - ], - "intent_features": [ - np.array( - [ - np.random.randint(2, size=(5, 10)), - np.random.randint(2, size=(2, 10)), - np.random.randint(2, size=(3, 10)), - np.random.randint(2, size=(1, 10)), - np.random.randint(2, size=(3, 10)), - ] - ) - ], - "intent_ids": [np.array([0, 1, 0, 1, 1])], - "tag_ids": [ - np.array( - [ - np.array([0, 1, 1, 0, 2]), - np.array([2, 0]), - np.array([0, 1, 1]), - np.array([0, 1]), - np.array([0, 0, 0]), - ] - ) - ], - } - - -def test_shuffle_session_data(session_data: SessionDataType): - shuffeled_session_data = shuffle_session_data(session_data) - - assert np.array(shuffeled_session_data.values()) != np.array(session_data.values()) - - -def test_split_session_data_by_label(session_data: SessionDataType): - split_session_data = split_session_data_by_label_ids( - session_data, session_data["intent_ids"][0], np.array([0, 1]) - ) - - assert len(split_session_data) == 2 - for s in split_session_data: - assert len(set(s["intent_ids"][0])) == 1 - - -def test_train_val_split(session_data: SessionDataType): - train_session_data, val_session_data = train_val_split( - session_data, 2, 42, "intent_ids" - ) - - for k, values in session_data.items(): - assert len(values) == len(train_session_data[k]) - assert len(values) == len(val_session_data[k]) - for i, v in enumerate(values): - assert v[0].dtype == train_session_data[k][i][0].dtype - - for values in train_session_data.values(): - for v in values: - assert v.shape[0] == 3 - - for values in val_session_data.values(): - for v in values: - assert v.shape[0] == 2 - - -@pytest.mark.parametrize("size", [0, 1, 5]) -def test_train_val_split_incorrect_size(session_data: SessionDataType, size): - with pytest.raises(ValueError): - train_val_split(session_data, size, 42, "intent_ids") - - -def test_session_data_for_ids(session_data: SessionDataType): - filtered_session_data = session_data_for_ids(session_data, np.array([0, 1])) - - for values in filtered_session_data.values(): - for v in values: - assert v.shape[0] == 2 - - k = list(session_data.keys())[0] - - assert np.all( - np.array(filtered_session_data[k][0][0]) == np.array(session_data[k][0][0]) - ) - assert np.all( - np.array(filtered_session_data[k][0][1]) == np.array(session_data[k][0][1]) - ) - - -def test_get_number_of_examples(session_data: SessionDataType): - num = get_number_of_examples(session_data) - - assert num == 5 - - -def test_get_number_of_examples_raises_value_error(session_data: SessionDataType): - 
session_data["dense"] = np.random.randint(5, size=(2, 10)) - with pytest.raises(ValueError): - get_number_of_examples(session_data) - - -def test_gen_batch(session_data: SessionDataType): - iterator = gen_batch( - session_data, 2, "intent_ids", shuffle=True, batch_strategy="balanced" - ) - - batch = next(iterator) - assert len(batch) == 7 - assert len(batch[0]) == 2 - - batch = next(iterator) - assert len(batch) == 7 - assert len(batch[0]) == 2 - - batch = next(iterator) - assert len(batch) == 7 - assert len(batch[0]) == 1 - - with pytest.raises(StopIteration): - next(iterator) - - -def test_balance_session_data(session_data: SessionDataType): - balanced_session_data = balance_session_data(session_data, 2, False, "intent_ids") - - for k, values in session_data.items(): - assert k in balanced_session_data - - for i, v in enumerate(values): - assert len(v) == len(balanced_session_data[k][i]) - - assert np.all(balanced_session_data["intent_ids"][0] == np.array([0, 1, 1, 0, 1])) diff --git a/tests/utils/test_validation.py b/tests/utils/test_validation.py index d691d07a2036..dc34c1ec64a3 100644 --- a/tests/utils/test_validation.py +++ b/tests/utils/test_validation.py @@ -10,9 +10,9 @@ "file, schema", [ ("examples/restaurantbot/domain.yml", DOMAIN_SCHEMA_FILE), - ("sample_configs/config_defaults.yml", CONFIG_SCHEMA_FILE), - ("sample_configs/config_supervised_embeddings.yml", CONFIG_SCHEMA_FILE), - ("sample_configs/config_crf_custom_features.yml", CONFIG_SCHEMA_FILE), + ("data/test_config/config_defaults.yml", CONFIG_SCHEMA_FILE), + ("data/test_config/config_supervised_embeddings.yml", CONFIG_SCHEMA_FILE), + ("data/test_config/config_crf_custom_features.yml", CONFIG_SCHEMA_FILE), ], ) def test_validate_yaml_schema(file, schema):